231 files changed, 4974 insertions, 2556 deletions
diff --git a/arch/arm/boot/dts/rk3036.dtsi b/arch/arm/boot/dts/rk3036.dtsi
index a935523a1eb8..7c2dc19925a1 100644
--- a/arch/arm/boot/dts/rk3036.dtsi
+++ b/arch/arm/boot/dts/rk3036.dtsi
@@ -204,7 +204,6 @@
 		g-np-tx-fifo-size = <16>;
 		g-rx-fifo-size = <275>;
 		g-tx-fifo-size = <256 128 128 64 64 32>;
-		g-use-dma;
 		status = "disabled";
 	};
 
diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 17ec2e2d7a60..74a749c566ee 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -596,7 +596,6 @@
 		g-np-tx-fifo-size = <16>;
 		g-rx-fifo-size = <275>;
 		g-tx-fifo-size = <256 128 128 64 64 32>;
-		g-use-dma;
 		phys = <&usbphy0>;
 		phy-names = "usb2-phy";
 		status = "disabled";
diff --git a/arch/arm/boot/dts/rk3xxx.dtsi b/arch/arm/boot/dts/rk3xxx.dtsi
index e15beb3c671e..8fbd3c806fa0 100644
--- a/arch/arm/boot/dts/rk3xxx.dtsi
+++ b/arch/arm/boot/dts/rk3xxx.dtsi
@@ -181,7 +181,6 @@
 		g-np-tx-fifo-size = <16>;
 		g-rx-fifo-size = <275>;
 		g-tx-fifo-size = <256 128 128 64 64 32>;
-		g-use-dma;
 		phys = <&usbphy0>;
 		phy-names = "usb2-phy";
 		status = "disabled";
diff --git a/arch/arm/include/asm/xen/hypercall.h b/arch/arm/include/asm/xen/hypercall.h
index 9d874db13c0e..3522cbaed316 100644
--- a/arch/arm/include/asm/xen/hypercall.h
+++ b/arch/arm/include/asm/xen/hypercall.h
@@ -1,87 +1 @@
-/******************************************************************************
- * hypercall.h
- *
- * Linux-specific hypervisor handling.
- *
- * Stefano Stabellini <stefano.stabellini@eu.citrix.com>, Citrix, 2012
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef _ASM_ARM_XEN_HYPERCALL_H
-#define _ASM_ARM_XEN_HYPERCALL_H
-
-#include <linux/bug.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/sched.h>
-#include <xen/interface/platform.h>
-
-long privcmd_call(unsigned call, unsigned long a1,
-		unsigned long a2, unsigned long a3,
-		unsigned long a4, unsigned long a5);
-int HYPERVISOR_xen_version(int cmd, void *arg);
-int HYPERVISOR_console_io(int cmd, int count, char *str);
-int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count);
-int HYPERVISOR_sched_op(int cmd, void *arg);
-int HYPERVISOR_event_channel_op(int cmd, void *arg);
-unsigned long HYPERVISOR_hvm_op(int op, void *arg);
-int HYPERVISOR_memory_op(unsigned int cmd, void *arg);
-int HYPERVISOR_physdev_op(int cmd, void *arg);
-int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args);
-int HYPERVISOR_tmem_op(void *arg);
-int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type);
-int HYPERVISOR_platform_op_raw(void *arg);
-static inline int HYPERVISOR_platform_op(struct xen_platform_op *op)
-{
-	op->interface_version = XENPF_INTERFACE_VERSION;
-	return HYPERVISOR_platform_op_raw(op);
-}
-int HYPERVISOR_multicall(struct multicall_entry *calls, uint32_t nr);
-
-static inline int
-HYPERVISOR_suspend(unsigned long start_info_mfn)
-{
-	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
-
-	/* start_info_mfn is unused on ARM */
-	return HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
-}
-
-static inline void
-MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
-			unsigned int new_val, unsigned long flags)
-{
-	BUG();
-}
-
-static inline void
-MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
-		 int count, int *success_count, domid_t domid)
-{
-	BUG();
-}
-
-#endif /* _ASM_ARM_XEN_HYPERCALL_H */
+#include <xen/arm/hypercall.h>
diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h
index 95251512e2c4..d6e7709d0688 100644
--- a/arch/arm/include/asm/xen/hypervisor.h
+++ b/arch/arm/include/asm/xen/hypervisor.h
@@ -1,39 +1 @@
-#ifndef _ASM_ARM_XEN_HYPERVISOR_H
-#define _ASM_ARM_XEN_HYPERVISOR_H
-
-#include <linux/init.h>
-
-extern struct shared_info *HYPERVISOR_shared_info;
-extern struct start_info *xen_start_info;
-
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
-	PARAVIRT_LAZY_NONE,
-	PARAVIRT_LAZY_MMU,
-	PARAVIRT_LAZY_CPU,
-};
-
-static inline enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
-{
-	return PARAVIRT_LAZY_NONE;
-}
-
-extern struct dma_map_ops *xen_dma_ops;
-
-#ifdef CONFIG_XEN
-void __init xen_early_init(void);
-#else
-static inline void xen_early_init(void) { return; }
-#endif
-
-#ifdef CONFIG_HOTPLUG_CPU
-static inline void xen_arch_register_cpu(int num)
-{
-}
-
-static inline void xen_arch_unregister_cpu(int num)
-{
-}
-#endif
-
-#endif /* _ASM_ARM_XEN_HYPERVISOR_H */
+#include <xen/arm/hypervisor.h>
diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h
index 75d596862892..88c0d75da190 100644
--- a/arch/arm/include/asm/xen/interface.h
+++ b/arch/arm/include/asm/xen/interface.h
@@ -1,85 +1 @@
-/******************************************************************************
- * Guest OS interface to ARM Xen.
- *
- * Stefano Stabellini <stefano.stabellini@eu.citrix.com>, Citrix, 2012
- */
-
-#ifndef _ASM_ARM_XEN_INTERFACE_H
-#define _ASM_ARM_XEN_INTERFACE_H
-
-#include <linux/types.h>
-
-#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
-
-#define __DEFINE_GUEST_HANDLE(name, type) \
-	typedef struct { union { type *p; uint64_aligned_t q; }; }  \
-        __guest_handle_ ## name
-
-#define DEFINE_GUEST_HANDLE_STRUCT(name) \
-	__DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name)        __guest_handle_ ## name
-
-#define set_xen_guest_handle(hnd, val)			\
-	do {						\
-		if (sizeof(hnd) == 8)			\
-			*(uint64_t *)&(hnd) = 0;	\
-		(hnd).p = val;				\
-	} while (0)
-
-#define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op
-
-#ifndef __ASSEMBLY__
-/* Explicitly size integers that represent pfns in the interface with
- * Xen so that we can have one ABI that works for 32 and 64 bit guests.
- * Note that this means that the xen_pfn_t type may be capable of
- * representing pfn's which the guest cannot represent in its own pfn
- * type. However since pfn space is controlled by the guest this is
- * fine since it simply wouldn't be able to create any sure pfns in
- * the first place.
- */
-typedef uint64_t xen_pfn_t;
-#define PRI_xen_pfn "llx"
-typedef uint64_t xen_ulong_t;
-#define PRI_xen_ulong "llx"
-typedef int64_t xen_long_t;
-#define PRI_xen_long "llx"
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint,  unsigned int);
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(void);
-DEFINE_GUEST_HANDLE(uint64_t);
-DEFINE_GUEST_HANDLE(uint32_t);
-DEFINE_GUEST_HANDLE(xen_pfn_t);
-DEFINE_GUEST_HANDLE(xen_ulong_t);
-
-/* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 1
-
-struct arch_vcpu_info { };
-struct arch_shared_info { };
-
-/* TODO: Move pvclock definitions some place arch independent */
-struct pvclock_vcpu_time_info {
-	u32   version;
-	u32   pad0;
-	u64   tsc_timestamp;
-	u64   system_time;
-	u32   tsc_to_system_mul;
-	s8    tsc_shift;
-	u8    flags;
-	u8    pad[2];
-} __attribute__((__packed__)); /* 32 bytes */
-
-/* It is OK to have a 12 bytes struct with no padding because it is packed */
-struct pvclock_wall_clock {
-	u32   version;
-	u32   sec;
-	u32   nsec;
-	u32   sec_hi;
-} __attribute__((__packed__));
-#endif
-
-#endif /* _ASM_ARM_XEN_INTERFACE_H */
+#include <xen/arm/interface.h>
diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h
index 95ce6ac3a971..b3ef061d8b74 100644
--- a/arch/arm/include/asm/xen/page-coherent.h
+++ b/arch/arm/include/asm/xen/page-coherent.h
@@ -1,98 +1 @@
-#ifndef _ASM_ARM_XEN_PAGE_COHERENT_H
-#define _ASM_ARM_XEN_PAGE_COHERENT_H
-
-#include <asm/page.h>
-#include <linux/dma-mapping.h>
-
-void __xen_dma_map_page(struct device *hwdev, struct page *page,
-	     dma_addr_t dev_addr, unsigned long offset, size_t size,
-	     enum dma_data_direction dir, unsigned long attrs);
-void __xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
-		size_t size, enum dma_data_direction dir,
-		unsigned long attrs);
-void __xen_dma_sync_single_for_cpu(struct device *hwdev,
-		dma_addr_t handle, size_t size, enum dma_data_direction dir);
-
-void __xen_dma_sync_single_for_device(struct device *hwdev,
-		dma_addr_t handle, size_t size, enum dma_data_direction dir);
-
-static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
-		dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs)
-{
-	return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs);
-}
-
-static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
-		void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs)
-{
-	__generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
-	     dma_addr_t dev_addr, unsigned long offset, size_t size,
-	     enum dma_data_direction dir, unsigned long attrs)
-{
-	unsigned long page_pfn = page_to_xen_pfn(page);
-	unsigned long dev_pfn = XEN_PFN_DOWN(dev_addr);
-	unsigned long compound_pages =
-		(1<<compound_order(page)) * XEN_PFN_PER_PAGE;
-	bool local = (page_pfn <= dev_pfn) &&
-		(dev_pfn - page_pfn < compound_pages);
-
-	/*
-	 * Dom0 is mapped 1:1, while the Linux page can span across
-	 * multiple Xen pages, it's not possible for it to contain a
-	 * mix of local and foreign Xen pages. So if the first xen_pfn
-	 * == mfn the page is local otherwise it's a foreign page
-	 * grant-mapped in dom0. If the page is local we can safely
-	 * call the native dma_ops function, otherwise we call the xen
-	 * specific function.
-	 */
-	if (local)
-		__generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs);
-	else
-		__xen_dma_map_page(hwdev, page, dev_addr, offset, size, dir, attrs);
-}
-
-static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-	unsigned long pfn = PFN_DOWN(handle);
-	/*
-	 * Dom0 is mapped 1:1, while the Linux page can be spanned accross
-	 * multiple Xen page, it's not possible to have a mix of local and
-	 * foreign Xen page. Dom0 is mapped 1:1, so calling pfn_valid on a
-	 * foreign mfn will always return false. If the page is local we can
-	 * safely call the native dma_ops function, otherwise we call the xen
-	 * specific function.
-	 */
-	if (pfn_valid(pfn)) {
-		if (__generic_dma_ops(hwdev)->unmap_page)
-			__generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs);
-	} else
-		__xen_dma_unmap_page(hwdev, handle, size, dir, attrs);
-}
-
-static inline void xen_dma_sync_single_for_cpu(struct device *hwdev,
-		dma_addr_t handle, size_t size, enum dma_data_direction dir)
-{
-	unsigned long pfn = PFN_DOWN(handle);
-	if (pfn_valid(pfn)) {
-		if (__generic_dma_ops(hwdev)->sync_single_for_cpu)
-			__generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir);
-	} else
-		__xen_dma_sync_single_for_cpu(hwdev, handle, size, dir);
-}
-
-static inline void xen_dma_sync_single_for_device(struct device *hwdev,
-		dma_addr_t handle, size_t size, enum dma_data_direction dir)
-{
-	unsigned long pfn = PFN_DOWN(handle);
-	if (pfn_valid(pfn)) {
-		if (__generic_dma_ops(hwdev)->sync_single_for_device)
-			__generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir);
-	} else
-		__xen_dma_sync_single_for_device(hwdev, handle, size, dir);
-}
-
-#endif /* _ASM_ARM_XEN_PAGE_COHERENT_H */
+#include <xen/arm/page-coherent.h>
diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h
index 415dbc6e43fd..31bbc803cecb 100644
--- a/arch/arm/include/asm/xen/page.h
+++ b/arch/arm/include/asm/xen/page.h
@@ -1,122 +1 @@
-#ifndef _ASM_ARM_XEN_PAGE_H
-#define _ASM_ARM_XEN_PAGE_H
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include <linux/pfn.h>
-#include <linux/types.h>
-#include <linux/dma-mapping.h>
-
-#include <xen/xen.h>
-#include <xen/interface/grant_table.h>
-
-#define phys_to_machine_mapping_valid(pfn) (1)
-
-/* Xen machine address */
-typedef struct xmaddr {
-	phys_addr_t maddr;
-} xmaddr_t;
-
-/* Xen pseudo-physical address */
-typedef struct xpaddr {
-	phys_addr_t paddr;
-} xpaddr_t;
-
-#define XMADDR(x)	((xmaddr_t) { .maddr = (x) })
-#define XPADDR(x)	((xpaddr_t) { .paddr = (x) })
-
-#define INVALID_P2M_ENTRY      (~0UL)
-
-/*
- * The pseudo-physical frame (pfn) used in all the helpers is always based
- * on Xen page granularity (i.e 4KB).
- *
- * A Linux page may be split across multiple non-contiguous Xen page so we
- * have to keep track with frame based on 4KB page granularity.
- *
- * PV drivers should never make a direct usage of those helpers (particularly
- * pfn_to_gfn and gfn_to_pfn).
- */
-
-unsigned long __pfn_to_mfn(unsigned long pfn);
-extern struct rb_root phys_to_mach;
-
-/* Pseudo-physical <-> Guest conversion */
-static inline unsigned long pfn_to_gfn(unsigned long pfn)
-{
-	return pfn;
-}
-
-static inline unsigned long gfn_to_pfn(unsigned long gfn)
-{
-	return gfn;
-}
-
-/* Pseudo-physical <-> BUS conversion */
-static inline unsigned long pfn_to_bfn(unsigned long pfn)
-{
-	unsigned long mfn;
-
-	if (phys_to_mach.rb_node != NULL) {
-		mfn = __pfn_to_mfn(pfn);
-		if (mfn != INVALID_P2M_ENTRY)
-			return mfn;
-	}
-
-	return pfn;
-}
-
-static inline unsigned long bfn_to_pfn(unsigned long bfn)
-{
-	return bfn;
-}
-
-#define bfn_to_local_pfn(bfn)	bfn_to_pfn(bfn)
-
-/* VIRT <-> GUEST conversion */
-#define virt_to_gfn(v)		(pfn_to_gfn(virt_to_phys(v) >> XEN_PAGE_SHIFT))
-#define gfn_to_virt(m)		(__va(gfn_to_pfn(m) << XEN_PAGE_SHIFT))
-
-/* Only used in PV code. But ARM guests are always HVM. */
-static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
-{
-	BUG();
-}
-
-/* TODO: this shouldn't be here but it is because the frontend drivers
- * are using it (its rolled in headers) even though we won't hit the code path.
- * So for right now just punt with this.
- */
-static inline pte_t *lookup_address(unsigned long address, unsigned int *level)
-{
-	BUG();
-	return NULL;
-}
-
-extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
-				   struct gnttab_map_grant_ref *kmap_ops,
-				   struct page **pages, unsigned int count);
-
-extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
-				     struct gnttab_unmap_grant_ref *kunmap_ops,
-				     struct page **pages, unsigned int count);
-
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn,
-		unsigned long nr_pages);
-
-static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-	return __set_phys_to_machine(pfn, mfn);
-}
-
-#define xen_remap(cookie, size) ioremap_cache((cookie), (size))
-#define xen_unmap(cookie) iounmap((cookie))
-
-bool xen_arch_need_swiotlb(struct device *dev,
-			   phys_addr_t phys,
-			   dma_addr_t dev_addr);
-unsigned long xen_get_swiotlb_free_pages(unsigned int order);
-
-#endif /* _ASM_ARM_XEN_PAGE_H */
+#include <xen/arm/page.h>
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index b38c10c73579..af05f8e0903e 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -87,9 +87,11 @@ struct kvm_regs {
 /* Supported VGICv3 address types  */
 #define KVM_VGIC_V3_ADDR_TYPE_DIST	2
 #define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+#define KVM_VGIC_ITS_ADDR_TYPE		4
 
 #define KVM_VGIC_V3_DIST_SIZE		SZ_64K
 #define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE		(2 * SZ_64K)
 
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2		1 /* CPU uses PSCI v0.2 */
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 3e1cd0452d67..90d0176fb30d 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -34,6 +34,7 @@ config KVM
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_MSI
 	depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER
 	---help---
 	  Support hosting virtualized guest machines.
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index f19842ea5418..d571243ab4d1 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -32,5 +32,6 @@ obj-y += $(KVM)/arm/vgic/vgic-mmio.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o
 obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
+obj-y += $(KVM)/arm/vgic/vgic-its.o
 obj-y += $(KVM)/irqchip.o
 obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 19b5f5c1c0ff..8f92efa8460e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -221,6 +221,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_MSI_DEVID:
+		if (!kvm)
+			r = -EINVAL;
+		else
+			r = kvm->arch.vgic.msis_require_devid;
+		break;
 	default:
 		r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
 		break;
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index f193414d0f6f..4986dc0c1dff 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -372,8 +372,7 @@ static int __init xen_guest_init(void)
 	 * for secondary CPUs as they are brought up.
 	 * For uniformity we use VCPUOP_register_vcpu_info even on cpu0.
 	 */
-	xen_vcpu_info = __alloc_percpu(sizeof(struct vcpu_info),
-			                       sizeof(struct vcpu_info));
+	xen_vcpu_info = alloc_percpu(struct vcpu_info);
 	if (xen_vcpu_info == NULL)
 		return -ENOMEM;
 
diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index d062f08f5020..bd62d94f8ac5 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -186,7 +186,6 @@ struct dma_map_ops *xen_dma_ops;
 EXPORT_SYMBOL(xen_dma_ops);
 
 static struct dma_map_ops xen_swiotlb_dma_ops = {
-	.mapping_error = xen_swiotlb_dma_mapping_error,
 	.alloc = xen_swiotlb_alloc_coherent,
 	.free = xen_swiotlb_free_coherent,
 	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 657be7f5014e..111742126897 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -110,6 +110,7 @@ config ARM64
 	select POWER_SUPPLY
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
+	select THREAD_INFO_IN_TASK
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
@@ -239,6 +240,9 @@ config PGTABLE_LEVELS
 	default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47
 	default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48
 
+config ARCH_SUPPORTS_UPROBES
+	def_bool y
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
@@ -791,6 +795,14 @@ config SETEND_EMULATION
 	  If unsure, say Y
 endif
 
+config ARM64_SW_TTBR0_PAN
+	bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
+	help
+	  Enabling this option prevents the kernel from accessing
+	  user-space memory directly by pointing TTBR0_EL1 to a reserved
+	  zeroed area and reserved ASID. The user access routines
+	  restore the valid TTBR0_EL1 temporarily.
+
 menu "ARMv8.1 architectural features"
 
 config ARM64_HW_AFDBM
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index b661fe742615..d1ebd46872fd 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -2,9 +2,13 @@ menu "Kernel hacking"
 
 source "lib/Kconfig.debug"
 
-config ARM64_PTDUMP
+config ARM64_PTDUMP_CORE
+	def_bool n
+
+config ARM64_PTDUMP_DEBUGFS
 	bool "Export kernel pagetable layout to userspace via debugfs"
 	depends on DEBUG_KERNEL
+	select ARM64_PTDUMP_CORE
 	select DEBUG_FS
         help
 	  Say Y here if you want to show the kernel pagetable layout in a
@@ -38,6 +42,35 @@ config ARM64_RANDOMIZE_TEXT_OFFSET
 	  of TEXT_OFFSET and platforms must not require a specific
 	  value.
 
+config DEBUG_WX
+	bool "Warn on W+X mappings at boot"
+	select ARM64_PTDUMP_CORE
+	---help---
+	  Generate a warning if any W+X mappings are found at boot.
+
+	  This is useful for discovering cases where the kernel is leaving
+	  W+X mappings after applying NX, as such mappings are a security risk.
+	  This check also includes UXN, which should be set on all kernel
+	  mappings.
+
+	  Look for a message in dmesg output like this:
+
+	    arm64/mm: Checked W+X mappings: passed, no W+X pages found.
+
+	  or like this, if the check failed:
+
+	    arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
+
+	  Note that even if the check fails, your kernel is possibly
+	  still fine, as W+X mappings are not a security hole in
+	  themselves, what they do is that they make the exploitation
+	  of other unfixed kernel bugs easier.
+
+	  There is no runtime or memory usage effect of this option
+	  once the kernel has booted up - it's a one time check.
+
+	  If in doubt, say "Y".
+
 config DEBUG_SET_MODULE_RONX
 	bool "Set loadable kernel module data as NX and text as RO"
 	depends on MODULES
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 3635b8662724..b9a4a934ca05 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -37,10 +37,16 @@ $(warning LSE atomics not supported by binutils)
   endif
 endif
 
-KBUILD_CFLAGS	+= -mgeneral-regs-only $(lseinstr)
+brokengasinst := $(call as-instr,1:\n.inst 0\n.rept . - 1b\n\nnop\n.endr\n,,-DCONFIG_BROKEN_GAS_INST=1)
+
+ifneq ($(brokengasinst),)
+$(warning Detected assembler with broken .inst; disassembly will be unreliable)
+endif
+
+KBUILD_CFLAGS	+= -mgeneral-regs-only $(lseinstr) $(brokengasinst)
 KBUILD_CFLAGS	+= -fno-asynchronous-unwind-tables
 KBUILD_CFLAGS	+= $(call cc-option, -mpc-relative-literal-loads)
-KBUILD_AFLAGS	+= $(lseinstr)
+KBUILD_AFLAGS	+= $(lseinstr) $(brokengasinst)
 
 ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
 KBUILD_CPPFLAGS	+= -mbig-endian
diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi
index 17839db585d5..e0ea60382087 100644
--- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi
+++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi
@@ -747,7 +747,6 @@
 			clocks = <&sys_ctrl HI6220_USBOTG_HCLK>;
 			clock-names = "otg";
 			dr_mode = "otg";
-			g-use-dma;
 			g-rx-fifo-size = <512>;
 			g-np-tx-fifo-size = <128>;
 			g-tx-fifo-size = <128 128 128 128 128 128>;
diff --git a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
index 2a7f731c7759..0ecaad4333a7 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
+++ b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
@@ -34,15 +34,6 @@
 
 	chosen { };
 
-	usb_p1_vbus: regulator@0 {
-		compatible = "regulator-fixed";
-		regulator-name = "usb_vbus";
-		regulator-min-microvolt = <5000000>;
-		regulator-max-microvolt = <5000000>;
-		gpio = <&pio 130 GPIO_ACTIVE_HIGH>;
-		enable-active-high;
-	};
-
 	connector {
 		compatible = "hdmi-connector";
 		label = "hdmi";
@@ -54,6 +45,29 @@
 			};
 		};
 	};
+
+	extcon_usb: extcon_iddig {
+		compatible = "linux,extcon-usb-gpio";
+		id-gpio = <&pio 16 GPIO_ACTIVE_HIGH>;
+	};
+
+	usb_p1_vbus: regulator@0 {
+		compatible = "regulator-fixed";
+		regulator-name = "usb_vbus";
+		regulator-min-microvolt = <5000000>;
+		regulator-max-microvolt = <5000000>;
+		gpio = <&pio 130 GPIO_ACTIVE_HIGH>;
+		enable-active-high;
+	};
+
+	usb_p0_vbus: regulator@1 {
+		compatible = "regulator-fixed";
+		regulator-name = "vbus";
+		regulator-min-microvolt = <5000000>;
+		regulator-max-microvolt = <5000000>;
+		gpio = <&pio 9 GPIO_ACTIVE_HIGH>;
+		enable-active-high;
+	};
 };
 
 &cec {
@@ -243,6 +257,20 @@
 			bias-pull-down = <MTK_PUPD_SET_R1R0_10>;
 		};
 	};
+
+	usb_id_pins_float: usb_iddig_pull_up {
+		pins_iddig {
+			pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>;
+			bias-pull-up;
+		};
+	};
+
+	usb_id_pins_ground: usb_iddig_pull_down {
+		pins_iddig {
+			pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>;
+			bias-pull-down;
+		};
+	};
 };
 
 &pwm0 {
@@ -469,12 +497,25 @@
 	status = "okay";
 };
 
+&ssusb {
+	vusb33-supply = <&mt6397_vusb_reg>;
+	vbus-supply = <&usb_p0_vbus>;
+	extcon = <&extcon_usb>;
+	dr_mode = "otg";
+	mediatek,enable-wakeup;
+	pinctrl-names = "default", "id_float", "id_ground";
+	pinctrl-0 = <&usb_id_pins_float>;
+	pinctrl-1 = <&usb_id_pins_float>;
+	pinctrl-2 = <&usb_id_pins_ground>;
+	status = "okay";
+};
+
 &uart0 {
 	status = "okay";
 };
 
-&usb30 {
+&usb_host {
 	vusb33-supply = <&mt6397_vusb_reg>;
 	vbus-supply = <&usb_p1_vbus>;
-	mediatek,wakeup-src = <1>;
+	status = "okay";
 };
diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
index 1c71e256601d..c2d588ca59b7 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
@@ -707,11 +707,14 @@
 			status = "disabled";
 		};
 
-		usb30: usb@11270000 {
-			compatible = "mediatek,mt8173-xhci";
-			reg = <0 0x11270000 0 0x1000>,
+		ssusb: usb@11271000 {
+			compatible = "mediatek,mt8173-mtu3";
+			reg = <0 0x11271000 0 0x3000>,
 			      <0 0x11280700 0 0x0100>;
-			interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>;
+			reg-names = "mac", "ippc";
+			interrupts = <GIC_SPI 64 IRQ_TYPE_LEVEL_LOW>;
+			phys = <&phy_port0 PHY_TYPE_USB3>,
+			       <&phy_port1 PHY_TYPE_USB2>;
 			power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>;
 			clocks = <&topckgen CLK_TOP_USB30_SEL>,
 				 <&pericfg CLK_PERI_USB0>,
@@ -719,10 +722,22 @@
 			clock-names = "sys_ck",
 				      "wakeup_deb_p0",
 				      "wakeup_deb_p1";
-			phys = <&phy_port0 PHY_TYPE_USB3>,
-			       <&phy_port1 PHY_TYPE_USB2>;
 			mediatek,syscon-wakeup = <&pericfg>;
-			status = "okay";
+			#address-cells = <2>;
+			#size-cells = <2>;
+			ranges;
+			status = "disabled";
+
+			usb_host: xhci@11270000 {
+				compatible = "mediatek,mt8173-xhci";
+				reg = <0 0x11270000 0 0x1000>;
+				reg-names = "mac";
+				interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>;
+				power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>;
+				clocks = <&topckgen CLK_TOP_USB30_SEL>;
+				clock-names = "sys_ck";
+				status = "disabled";
+			};
 		};
 
 		u3phy: usb-phy@11290000 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi
index 0fcb2147c9f9..df231c4df5a5 100644
--- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi
@@ -537,7 +537,6 @@
 		g-np-tx-fifo-size = <16>;
 		g-rx-fifo-size = <275>;
 		g-tx-fifo-size = <256 128 128 64 64 32>;
-		g-use-dma;
 		status = "disabled";
 	};
 
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 6be08113a96d..c3caaddde6cc 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -82,6 +82,7 @@ CONFIG_KEXEC=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_COMPAT=y
 CONFIG_CPU_IDLE=y
+CONFIG_HIBERNATION=y
 CONFIG_ARM_CPUIDLE=y
 CONFIG_CPU_FREQ=y
 CONFIG_CPUFREQ_DT=y
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index b4ab238a59ec..8365a84c2640 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -1,7 +1,6 @@
 generic-y += bugs.h
 generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += current.h
 generic-y += delay.h
 generic-y += div64.h
 generic-y += dma.h
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 28bfe6132eb6..446f6c46d4b1 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -41,6 +41,15 @@
 	msr	daifclr, #2
 	.endm
 
+	.macro	save_and_disable_irq, flags
+	mrs	\flags, daif
+	msr	daifset, #2
+	.endm
+
+	.macro	restore_irq, flags
+	msr	daif, \flags
+	.endm
+
 /*
  * Enable and disable debug exceptions.
  */
@@ -202,14 +211,25 @@ lr	.req	x30		// link register
 	.endm
 
 	/*
+	 * @dst: Result of per_cpu(sym, smp_processor_id())
 	 * @sym: The name of the per-cpu variable
-	 * @reg: Result of per_cpu(sym, smp_processor_id())
 	 * @tmp: scratch register
 	 */
-	.macro this_cpu_ptr, sym, reg, tmp
-	adr_l	\reg, \sym
+	.macro adr_this_cpu, dst, sym, tmp
+	adr_l	\dst, \sym
 	mrs	\tmp, tpidr_el1
-	add	\reg, \reg, \tmp
+	add	\dst, \dst, \tmp
+	.endm
+
+	/*
+	 * @dst: Result of READ_ONCE(per_cpu(sym, smp_processor_id()))
+	 * @sym: The name of the per-cpu variable
+	 * @tmp: scratch register
+	 */
+	.macro ldr_this_cpu dst, sym, tmp
+	adr_l	\dst, \sym
+	mrs	\tmp, tpidr_el1
+	ldr	\dst, [\dst, \tmp]
 	.endm
 
 /*
@@ -395,4 +415,24 @@ alternative_endif
 	movk	\reg, :abs_g0_nc:\val
 	.endm
 
+/*
+ * Return the current thread_info.
+ */
+	.macro	get_thread_info, rd
+	mrs	\rd, sp_el0
+	.endm
+
+/*
+ * Errata workaround post TTBR0_EL1 update.
+ */
+	.macro	post_ttbr0_update_workaround
+#ifdef CONFIG_CAVIUM_ERRATUM_27456
+alternative_if ARM64_WORKAROUND_CAVIUM_27456
+	ic	iallu
+	dsb	nsh
+	isb
+alternative_else_nop_endif
+#endif
+	.endm
+
 #endif	/* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 2e5fb976a572..5a2a6ee65f65 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -65,12 +65,12 @@
  *		- kaddr  - page address
  *		- size   - region size
  */
-extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void __flush_dcache_area(void *addr, size_t len);
 extern void __clean_dcache_area_poc(void *addr, size_t len);
 extern void __clean_dcache_area_pou(void *addr, size_t len);
 extern long __flush_cache_user_range(unsigned long start, unsigned long end);
+extern void sync_icache_aliases(void *kaddr, unsigned long len);
 
 static inline void flush_cache_mm(struct mm_struct *mm)
 {
@@ -81,6 +81,11 @@ static inline void flush_cache_page(struct vm_area_struct *vma,
 {
 }
 
+static inline void flush_cache_range(struct vm_area_struct *vma,
+				     unsigned long start, unsigned long end)
+{
+}
+
 /*
  * Cache maintenance functions used by the DMA API. No to be used directly.
  */
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 87b446535185..4174f09678c4 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -34,7 +34,8 @@
 #define ARM64_HAS_32BIT_EL0			13
 #define ARM64_HYP_OFFSET_LOW			14
 #define ARM64_MISMATCHED_CACHE_LINE_SIZE	15
+#define ARM64_HAS_NO_FPSIMD			16
 
-#define ARM64_NCAPS				16
+#define ARM64_NCAPS				17
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 0bc0b1de90c4..b4989df48670 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -9,8 +9,6 @@
 #ifndef __ASM_CPUFEATURE_H
 #define __ASM_CPUFEATURE_H
 
-#include <linux/jump_label.h>
-
 #include <asm/cpucaps.h>
 #include <asm/hwcap.h>
 #include <asm/sysreg.h>
@@ -27,6 +25,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/bug.h>
+#include <linux/jump_label.h>
 #include <linux/kernel.h>
 
 /* CPU feature register tracking */
@@ -104,14 +104,19 @@ static inline bool cpu_have_feature(unsigned int num)
 	return elf_hwcap & (1UL << num);
 }
 
+/* System capability check for constant caps */
+static inline bool cpus_have_const_cap(int num)
+{
+	if (num >= ARM64_NCAPS)
+		return false;
+	return static_branch_unlikely(&cpu_hwcap_keys[num]);
+}
+
 static inline bool cpus_have_cap(unsigned int num)
 {
 	if (num >= ARM64_NCAPS)
 		return false;
-	if (__builtin_constant_p(num))
-		return static_branch_unlikely(&cpu_hwcap_keys[num]);
-	else
-		return test_bit(num, cpu_hwcaps);
+	return test_bit(num, cpu_hwcaps);
 }
 
 static inline void cpus_set_cap(unsigned int num)
@@ -200,7 +205,7 @@ static inline bool cpu_supports_mixed_endian_el0(void)
 
 static inline bool system_supports_32bit_el0(void)
 {
-	return cpus_have_cap(ARM64_HAS_32BIT_EL0);
+	return cpus_have_const_cap(ARM64_HAS_32BIT_EL0);
 }
 
 static inline bool system_supports_mixed_endian_el0(void)
@@ -208,6 +213,17 @@ static inline bool system_supports_mixed_endian_el0(void)
 	return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1));
 }
 
+static inline bool system_supports_fpsimd(void)
+{
+	return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD);
+}
+
+static inline bool system_uses_ttbr0_pan(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) &&
+		!cpus_have_cap(ARM64_HAS_PAN);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h
new file mode 100644
index 000000000000..f2bcbe2d9889
--- /dev/null
+++ b/arch/arm64/include/asm/current.h
@@ -0,0 +1,22 @@
+#ifndef __ASM_CURRENT_H
+#define __ASM_CURRENT_H
+
+#include <linux/compiler.h>
+
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
+struct task_struct;
+
+static __always_inline struct task_struct *get_current(void)
+{
+	return (struct task_struct *)read_sysreg(sp_el0);
+}
+
+#define current get_current()
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_CURRENT_H */
+
diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h
index b71420a12f26..a44cf5225429 100644
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -68,6 +68,9 @@
 #define BRK64_ESR_MASK		0xFFFF
 #define BRK64_ESR_KPROBES	0x0004
 #define BRK64_OPCODE_KPROBES	(AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5))
+/* uprobes BRK opcodes with ESR encoding  */
+#define BRK64_ESR_UPROBES	0x0005
+#define BRK64_OPCODE_UPROBES	(AARCH64_BREAK_MON | (BRK64_ESR_UPROBES << 5))
 
 /* AArch32 */
 #define DBG_ESR_EVT_BKPT	0x4
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 771b3f0bc757..0b6b1633017f 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_EFI_H
 #define _ASM_EFI_H
 
+#include <asm/cpufeature.h>
 #include <asm/io.h>
 #include <asm/mmu_context.h>
 #include <asm/neon.h>
@@ -78,7 +79,30 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
 
 static inline void efi_set_pgd(struct mm_struct *mm)
 {
-	switch_mm(NULL, mm, NULL);
+	__switch_mm(mm);
+
+	if (system_uses_ttbr0_pan()) {
+		if (mm != current->active_mm) {
+			/*
+			 * Update the current thread's saved ttbr0 since it is
+			 * restored as part of a return from exception. Set
+			 * the hardware TTBR0_EL1 using cpu_switch_mm()
+			 * directly to enable potential errata workarounds.
+			 */
+			update_saved_ttbr0(current, mm);
+			cpu_switch_mm(mm->pgd, mm);
+		} else {
+			/*
+			 * Defer the switch to the current thread's TTBR0_EL1
+			 * until uaccess_enable(). Restore the current
+			 * thread's saved ttbr0 corresponding to its active_mm
+			 * (if different from init_mm).
+			 */
+			cpu_set_reserved_ttbr0();
+			if (current->active_mm != &init_mm)
+				update_saved_ttbr0(current, current->active_mm);
+		}
+	}
 }
 
 void efi_virtmap_load(void);
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index a55384f4a5d7..5d1700425efe 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -138,7 +138,11 @@ typedef struct user_fpsimd_state elf_fpregset_t;
  */
 #define ELF_PLAT_INIT(_r, load_addr)	(_r)->regs[0] = 0
 
-#define SET_PERSONALITY(ex)		clear_thread_flag(TIF_32BIT);
+#define SET_PERSONALITY(ex)						\
+({									\
+	clear_bit(TIF_32BIT, &current->mm->context.flags);		\
+	clear_thread_flag(TIF_32BIT);					\
+})
 
 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
 #define ARCH_DLINFO							\
@@ -183,7 +187,11 @@ typedef compat_elf_greg_t		compat_elf_gregset_t[COMPAT_ELF_NGREG];
 					 ((x)->e_flags & EF_ARM_EABI_MASK))
 
 #define compat_start_thread		compat_start_thread
-#define COMPAT_SET_PERSONALITY(ex)	set_thread_flag(TIF_32BIT);
+#define COMPAT_SET_PERSONALITY(ex)					\
+({									\
+	set_bit(TIF_32BIT, &current->mm->context.flags);		\
+	set_thread_flag(TIF_32BIT);					\
+ })
 #define COMPAT_ARCH_DLINFO
 extern int aarch32_setup_vectors_page(struct linux_binprm *bprm,
 				      int uses_interp);
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index f2585cdd32c2..85c4a8981d47 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -21,15 +21,12 @@
 #include <linux/futex.h>
 #include <linux/uaccess.h>
 
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
 #include <asm/errno.h>
-#include <asm/sysreg.h>
 
 #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)		\
+do {									\
+	uaccess_enable();						\
 	asm volatile(							\
-	ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,		\
-		    CONFIG_ARM64_PAN)					\
 "	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%w1, %2\n"						\
 	insn "\n"							\
@@ -44,11 +41,11 @@
 "	.popsection\n"							\
 	_ASM_EXTABLE(1b, 4b)						\
 	_ASM_EXTABLE(2b, 4b)						\
-	ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,		\
-		    CONFIG_ARM64_PAN)					\
 	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp)	\
 	: "r" (oparg), "Ir" (-EFAULT)					\
-	: "memory")
+	: "memory");							\
+	uaccess_disable();						\
+} while (0)
 
 static inline int
 futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
@@ -118,8 +115,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
+	uaccess_enable();
 	asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
 "	prfm	pstl1strm, %2\n"
 "1:	ldxr	%w1, %2\n"
 "	sub	%w3, %w1, %w4\n"
@@ -134,10 +131,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
 "	.popsection\n"
 	_ASM_EXTABLE(1b, 4b)
 	_ASM_EXTABLE(2b, 4b)
-ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
 	: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
 	: "r" (oldval), "r" (newval), "Ir" (-EFAULT)
 	: "memory");
+	uaccess_disable();
 
 	*uval = val;
 	return ret;
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h
index 9510ace570e2..b6b167ac082b 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -77,7 +77,11 @@ static inline void decode_ctrl_reg(u32 reg,
 /* Lengths */
 #define ARM_BREAKPOINT_LEN_1	0x1
 #define ARM_BREAKPOINT_LEN_2	0x3
+#define ARM_BREAKPOINT_LEN_3	0x7
 #define ARM_BREAKPOINT_LEN_4	0xf
+#define ARM_BREAKPOINT_LEN_5	0x1f
+#define ARM_BREAKPOINT_LEN_6	0x3f
+#define ARM_BREAKPOINT_LEN_7	0x7f
 #define ARM_BREAKPOINT_LEN_8	0xff
 
 /* Kernel stepping */
@@ -119,7 +123,7 @@ struct perf_event;
 struct pmu;
 
 extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
-				  int *gen_len, int *gen_type);
+				  int *gen_len, int *gen_type, int *offset);
 extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
 extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 7e51d1b57c0c..7803343e5881 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -19,6 +19,7 @@
 #ifndef __ASM_KERNEL_PGTABLE_H
 #define __ASM_KERNEL_PGTABLE_H
 
+#include <asm/pgtable.h>
 #include <asm/sparsemem.h>
 
 /*
@@ -54,6 +55,12 @@
 #define SWAPPER_DIR_SIZE	(SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
 #define IDMAP_DIR_SIZE		(IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
 
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+#define RESERVED_TTBR0_SIZE	(PAGE_SIZE)
+#else
+#define RESERVED_TTBR0_SIZE	(0)
+#endif
+
 /* Initial memory map size */
 #if ARM64_SWAPPER_USES_SECTION_MAPS
 #define SWAPPER_BLOCK_SHIFT	SECTION_SHIFT
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 8d9fce037b2f..47619411f0ff 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -19,6 +19,7 @@
 typedef struct {
 	atomic64_t	id;
 	void		*vdso;
+	unsigned long	flags;
 } mm_context_t;
 
 /*
@@ -34,7 +35,7 @@ extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
 extern void init_mem_pgprot(void);
 extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
-			       pgprot_t prot, bool allow_block_mappings);
+			       pgprot_t prot, bool page_mappings_only);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
 
 #endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index a50185375f09..0363fe80455c 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -23,6 +23,7 @@
 #include <linux/sched.h>
 
 #include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
 #include <asm/proc-fns.h>
 #include <asm-generic/mm_hooks.h>
 #include <asm/cputype.h>
@@ -103,7 +104,7 @@ static inline void cpu_uninstall_idmap(void)
 	local_flush_tlb_all();
 	cpu_set_default_tcr_t0sz();
 
-	if (mm != &init_mm)
+	if (mm != &init_mm && !system_uses_ttbr0_pan())
 		cpu_switch_mm(mm->pgd, mm);
 }
 
@@ -163,20 +164,26 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 }
 
-/*
- * This is the actual mm switch as far as the scheduler
- * is concerned.  No registers are touched.  We avoid
- * calling the CPU specific function when the mm hasn't
- * actually changed.
- */
-static inline void
-switch_mm(struct mm_struct *prev, struct mm_struct *next,
-	  struct task_struct *tsk)
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+				      struct mm_struct *mm)
 {
-	unsigned int cpu = smp_processor_id();
+	if (system_uses_ttbr0_pan()) {
+		BUG_ON(mm->pgd == swapper_pg_dir);
+		task_thread_info(tsk)->ttbr0 =
+			virt_to_phys(mm->pgd) | ASID(mm) << 48;
+	}
+}
+#else
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+				      struct mm_struct *mm)
+{
+}
+#endif
 
-	if (prev == next)
-		return;
+static inline void __switch_mm(struct mm_struct *next)
+{
+	unsigned int cpu = smp_processor_id();
 
 	/*
 	 * init_mm.pgd does not contain any user mappings and it is always
@@ -190,8 +197,26 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	check_and_switch_context(next, cpu);
 }
 
+static inline void
+switch_mm(struct mm_struct *prev, struct mm_struct *next,
+	  struct task_struct *tsk)
+{
+	if (prev != next)
+		__switch_mm(next);
+
+	/*
+	 * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
+	 * value may have not been initialised yet (activate_mm caller) or the
+	 * ASID has changed since the last run (following the context switch
+	 * of another thread of the same process). Avoid setting the reserved
+	 * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit).
+	 */
+	if (next != &init_mm)
+		update_saved_ttbr0(tsk, next);
+}
+
 #define deactivate_mm(tsk,mm)	do { } while (0)
-#define activate_mm(prev,next)	switch_mm(prev, next, NULL)
+#define activate_mm(prev,next)	switch_mm(prev, next, current)
 
 void verify_cpu_asid_bits(void);
 
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index 13ce4cc18e26..ad4cdc966c0f 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -9,8 +9,9 @@
  */
 
 #include <linux/types.h>
+#include <asm/fpsimd.h>
 
-#define cpu_has_neon()		(1)
+#define cpu_has_neon()		system_supports_fpsimd()
 
 #define kernel_neon_begin()	kernel_neon_begin_partial(32)
 
diff --git a/arch/arm64/include/asm/opcodes.h b/arch/arm64/include/asm/opcodes.h
deleted file mode 100644
index 123f45d92cd1..000000000000
--- a/arch/arm64/include/asm/opcodes.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define CONFIG_CPU_ENDIAN_BE8 CONFIG_CPU_BIG_ENDIAN
-#endif
-
-#include <../../arm/include/asm/opcodes.h>
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 5394c8405e66..3bd498e4de4c 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -16,6 +16,8 @@
 #ifndef __ASM_PERCPU_H
 #define __ASM_PERCPU_H
 
+#include <asm/stack_pointer.h>
+
 static inline void set_my_cpu_offset(unsigned long off)
 {
 	asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory");
@@ -101,16 +103,16 @@ static inline unsigned long __percpu_read(void *ptr, int size)
 
 	switch (size) {
 	case 1:
-		ret = ACCESS_ONCE(*(u8 *)ptr);
+		ret = READ_ONCE(*(u8 *)ptr);
 		break;
 	case 2:
-		ret = ACCESS_ONCE(*(u16 *)ptr);
+		ret = READ_ONCE(*(u16 *)ptr);
 		break;
 	case 4:
-		ret = ACCESS_ONCE(*(u32 *)ptr);
+		ret = READ_ONCE(*(u32 *)ptr);
 		break;
 	case 8:
-		ret = ACCESS_ONCE(*(u64 *)ptr);
+		ret = READ_ONCE(*(u64 *)ptr);
 		break;
 	default:
 		BUILD_BUG();
@@ -123,16 +125,16 @@ static inline void __percpu_write(void *ptr, unsigned long val, int size)
 {
 	switch (size) {
 	case 1:
-		ACCESS_ONCE(*(u8 *)ptr) = (u8)val;
+		WRITE_ONCE(*(u8 *)ptr, (u8)val);
 		break;
 	case 2:
-		ACCESS_ONCE(*(u16 *)ptr) = (u16)val;
+		WRITE_ONCE(*(u16 *)ptr, (u16)val);
 		break;
 	case 4:
-		ACCESS_ONCE(*(u32 *)ptr) = (u32)val;
+		WRITE_ONCE(*(u32 *)ptr, (u32)val);
 		break;
 	case 8:
-		ACCESS_ONCE(*(u64 *)ptr) = (u64)val;
+		WRITE_ONCE(*(u64 *)ptr, (u64)val);
 		break;
 	default:
 		BUILD_BUG();
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 38b6a2b49d68..8d5cbec17d80 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -17,6 +17,8 @@
 #ifndef __ASM_PERF_EVENT_H
 #define __ASM_PERF_EVENT_H
 
+#include <asm/stack_pointer.h>
+
 #define	ARMV8_PMU_MAX_COUNTERS	32
 #define	ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
 
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h
index 5af574d632fa..6a5b28904c33 100644
--- a/arch/arm64/include/asm/probes.h
+++ b/arch/arm64/include/asm/probes.h
@@ -15,21 +15,22 @@
 #ifndef _ARM_PROBES_H
 #define _ARM_PROBES_H
 
-#include <asm/opcodes.h>
-
-struct kprobe;
-struct arch_specific_insn;
-
-typedef u32 kprobe_opcode_t;
-typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *);
+typedef u32 probe_opcode_t;
+typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *);
 
 /* architecture specific copy of original instruction */
-struct arch_specific_insn {
-	kprobe_opcode_t *insn;
+struct arch_probe_insn {
+	probe_opcode_t *insn;
 	pstate_check_t *pstate_cc;
-	kprobes_handler_t *handler;
+	probes_handler_t *handler;
 	/* restore address after step xol */
 	unsigned long restore;
 };
+#ifdef CONFIG_KPROBES
+typedef u32 kprobe_opcode_t;
+struct arch_specific_insn {
+	struct arch_probe_insn api;
+};
+#endif
 
 #endif
diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index 07b8ed037dee..6afd8476c60c 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -16,9 +16,10 @@
 #ifndef __ASM_PTDUMP_H
 #define __ASM_PTDUMP_H
 
-#ifdef CONFIG_ARM64_PTDUMP
+#ifdef CONFIG_ARM64_PTDUMP_CORE
 
 #include <linux/mm_types.h>
+#include <linux/seq_file.h>
 
 struct addr_marker {
 	unsigned long start_address;
@@ -29,16 +30,25 @@ struct ptdump_info {
 	struct mm_struct		*mm;
 	const struct addr_marker	*markers;
 	unsigned long			base_addr;
-	unsigned long			max_addr;
 };
 
-int ptdump_register(struct ptdump_info *info, const char *name);
-
+void ptdump_walk_pgd(struct seq_file *s, struct ptdump_info *info);
+#ifdef CONFIG_ARM64_PTDUMP_DEBUGFS
+int ptdump_debugfs_register(struct ptdump_info *info, const char *name);
 #else
-static inline int ptdump_register(struct ptdump_info *info, const char *name)
+static inline int ptdump_debugfs_register(struct ptdump_info *info,
+					const char *name)
 {
 	return 0;
 }
-#endif /* CONFIG_ARM64_PTDUMP */
+#endif
+void ptdump_check_wx(void);
+#endif /* CONFIG_ARM64_PTDUMP_CORE */
+
+#ifdef CONFIG_DEBUG_WX
+#define debug_checkwx()	ptdump_check_wx()
+#else
+#define debug_checkwx()	do { } while (0)
+#endif
 
 #endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index ada08b5b036d..513daf050e84 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -217,6 +217,14 @@ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task);
 
 #include <asm-generic/ptrace.h>
 
+#define procedure_link_pointer(regs)	((regs)->regs[30])
+
+static inline void procedure_link_pointer_set(struct pt_regs *regs,
+					   unsigned long val)
+{
+	procedure_link_pointer(regs) = val;
+}
+
 #undef profile_pc
 extern unsigned long profile_pc(struct pt_regs *regs);
 
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 022644704a93..d050d720a1b4 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -29,11 +29,22 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/percpu.h>
+
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/thread_info.h>
 
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
+
+/*
+ * We don't use this_cpu_read(cpu_number) as that has implicit writes to
+ * preempt_count, and associated (compiler) barriers, that we'd like to avoid
+ * the expense of. If we're preemptible, the value can be stale at use anyway.
+ * And we can't use this_cpu_ptr() either, as that winds up recursing back
+ * here under CONFIG_DEBUG_PREEMPT=y.
+ */
+#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
 
 struct seq_file;
 
@@ -73,6 +84,7 @@ asmlinkage void secondary_start_kernel(void);
  */
 struct secondary_data {
 	void *stack;
+	struct task_struct *task;
 	long status;
 };
 
diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h
new file mode 100644
index 000000000000..ffcdf742cddf
--- /dev/null
+++ b/arch/arm64/include/asm/stack_pointer.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_STACK_POINTER_H
+#define __ASM_STACK_POINTER_H
+
+/*
+ * how to get the current stack pointer from C
+ */
+register unsigned long current_stack_pointer asm ("sp");
+
+#endif /* __ASM_STACK_POINTER_H */
diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h
index b8a313fd7a09..de5600f40adf 100644
--- a/arch/arm64/include/asm/suspend.h
+++ b/arch/arm64/include/asm/suspend.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_SUSPEND_H
 #define __ASM_SUSPEND_H
 
-#define NR_CTX_REGS 10
+#define NR_CTX_REGS 12
 #define NR_CALLEE_SAVED_REGS 12
 
 /*
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 6c80b3699cb8..98ae03f8eedd 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -22,8 +22,6 @@
 
 #include <linux/stringify.h>
 
-#include <asm/opcodes.h>
-
 /*
  * ARMv8 ARM reserves the following encoding for system registers:
  * (Ref: ARMv8 ARM, Section: "System instruction class encoding overview",
@@ -37,6 +35,33 @@
 #define sys_reg(op0, op1, crn, crm, op2) \
 	((((op0)&3)<<19)|((op1)<<16)|((crn)<<12)|((crm)<<8)|((op2)<<5))
 
+#ifndef CONFIG_BROKEN_GAS_INST
+
+#ifdef __ASSEMBLY__
+#define __emit_inst(x)			.inst (x)
+#else
+#define __emit_inst(x)			".inst " __stringify((x)) "\n\t"
+#endif
+
+#else  /* CONFIG_BROKEN_GAS_INST */
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+#define __INSTR_BSWAP(x)		(x)
+#else  /* CONFIG_CPU_BIG_ENDIAN */
+#define __INSTR_BSWAP(x)		((((x) << 24) & 0xff000000)	| \
+					 (((x) <<  8) & 0x00ff0000)	| \
+					 (((x) >>  8) & 0x0000ff00)	| \
+					 (((x) >> 24) & 0x000000ff))
+#endif	/* CONFIG_CPU_BIG_ENDIAN */
+
+#ifdef __ASSEMBLY__
+#define __emit_inst(x)			.long __INSTR_BSWAP(x)
+#else  /* __ASSEMBLY__ */
+#define __emit_inst(x)			".long " __stringify(__INSTR_BSWAP(x)) "\n\t"
+#endif	/* __ASSEMBLY__ */
+
+#endif	/* CONFIG_BROKEN_GAS_INST */
+
 #define SYS_MIDR_EL1			sys_reg(3, 0, 0, 0, 0)
 #define SYS_MPIDR_EL1			sys_reg(3, 0, 0, 0, 5)
 #define SYS_REVIDR_EL1			sys_reg(3, 0, 0, 0, 6)
@@ -81,10 +106,10 @@
 #define REG_PSTATE_PAN_IMM		sys_reg(0, 0, 4, 0, 4)
 #define REG_PSTATE_UAO_IMM		sys_reg(0, 0, 4, 0, 3)
 
-#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\
-				     (!!x)<<8 | 0x1f)
-#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\
-				     (!!x)<<8 | 0x1f)
+#define SET_PSTATE_PAN(x) __emit_inst(0xd5000000 | REG_PSTATE_PAN_IMM |	\
+				      (!!x)<<8 | 0x1f)
+#define SET_PSTATE_UAO(x) __emit_inst(0xd5000000 | REG_PSTATE_UAO_IMM |	\
+				      (!!x)<<8 | 0x1f)
 
 /* Common SCTLR_ELx flags. */
 #define SCTLR_ELx_EE    (1 << 25)
@@ -228,11 +253,11 @@
 	.equ	.L__reg_num_xzr, 31
 
 	.macro	mrs_s, rt, sreg
-	.inst	0xd5200000|(\sreg)|(.L__reg_num_\rt)
+	 __emit_inst(0xd5200000|(\sreg)|(.L__reg_num_\rt))
 	.endm
 
 	.macro	msr_s, sreg, rt
-	.inst	0xd5000000|(\sreg)|(.L__reg_num_\rt)
+	__emit_inst(0xd5000000|(\sreg)|(.L__reg_num_\rt))
 	.endm
 
 #else
@@ -246,11 +271,11 @@ asm(
 "	.equ	.L__reg_num_xzr, 31\n"
 "\n"
 "	.macro	mrs_s, rt, sreg\n"
-"	.inst	0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n"
+	__emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt))
 "	.endm\n"
 "\n"
 "	.macro	msr_s, sreg, rt\n"
-"	.inst	0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n"
+	__emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt))
 "	.endm\n"
 );
 
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index e9ea5a6bd449..46c3b93cf865 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -36,58 +36,31 @@
 
 struct task_struct;
 
+#include <asm/stack_pointer.h>
 #include <asm/types.h>
 
 typedef unsigned long mm_segment_t;
 
 /*
  * low level task data that entry.S needs immediate access to.
- * __switch_to() assumes cpu_context follows immediately after cpu_domain.
  */
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	mm_segment_t		addr_limit;	/* address limit */
-	struct task_struct	*task;		/* main task structure */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	u64			ttbr0;		/* saved TTBR0_EL1 */
+#endif
 	int			preempt_count;	/* 0 => preemptable, <0 => bug */
-	int			cpu;		/* cpu */
 };
 
 #define INIT_THREAD_INFO(tsk)						\
 {									\
-	.task		= &tsk,						\
-	.flags		= 0,						\
 	.preempt_count	= INIT_PREEMPT_COUNT,				\
 	.addr_limit	= KERNEL_DS,					\
 }
 
-#define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
-/*
- * how to get the current stack pointer from C
- */
-register unsigned long current_stack_pointer asm ("sp");
-
-/*
- * how to get the thread information struct from C
- */
-static inline struct thread_info *current_thread_info(void) __attribute_const__;
-
-/*
- * struct thread_info can be accessed directly via sp_el0.
- *
- * We don't use read_sysreg() as we want the compiler to cache the value where
- * possible.
- */
-static inline struct thread_info *current_thread_info(void)
-{
-	unsigned long sp_el0;
-
-	asm ("mrs %0, sp_el0" : "=r" (sp_el0));
-
-	return (struct thread_info *)sp_el0;
-}
-
 #define thread_saved_pc(tsk)	\
 	((unsigned long)(tsk->thread.cpu_context.pc))
 #define thread_saved_sp(tsk)	\
@@ -112,6 +85,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED	1
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE	3	/* CPU's FP state is not current's */
+#define TIF_UPROBE		4	/* uprobe breakpoint or singlestep */
 #define TIF_NOHZ		7
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
@@ -132,10 +106,12 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 
 #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
-				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
+				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
+				 _TIF_UPROBE)
 
 #define _TIF_SYSCALL_WORK	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 55d0adbf6509..d26750ca6e06 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -18,6 +18,12 @@
 #ifndef __ASM_UACCESS_H
 #define __ASM_UACCESS_H
 
+#include <asm/alternative.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
 /*
  * User space memory access functions
  */
@@ -26,10 +32,8 @@
 #include <linux/string.h>
 #include <linux/thread_info.h>
 
-#include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/ptrace.h>
-#include <asm/sysreg.h>
 #include <asm/errno.h>
 #include <asm/memory.h>
 #include <asm/compiler.h>
@@ -120,6 +124,99 @@ static inline void set_fs(mm_segment_t fs)
 	"	.popsection\n"
 
 /*
+ * User access enabling/disabling.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void __uaccess_ttbr0_disable(void)
+{
+	unsigned long ttbr;
+
+	/* reserved_ttbr0 placed at the end of swapper_pg_dir */
+	ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE;
+	write_sysreg(ttbr, ttbr0_el1);
+	isb();
+}
+
+static inline void __uaccess_ttbr0_enable(void)
+{
+	unsigned long flags;
+
+	/*
+	 * Disable interrupts to avoid preemption between reading the 'ttbr0'
+	 * variable and the MSR. A context switch could trigger an ASID
+	 * roll-over and an update of 'ttbr0'.
+	 */
+	local_irq_save(flags);
+	write_sysreg(current_thread_info()->ttbr0, ttbr0_el1);
+	isb();
+	local_irq_restore(flags);
+}
+
+static inline bool uaccess_ttbr0_disable(void)
+{
+	if (!system_uses_ttbr0_pan())
+		return false;
+	__uaccess_ttbr0_disable();
+	return true;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+	if (!system_uses_ttbr0_pan())
+		return false;
+	__uaccess_ttbr0_enable();
+	return true;
+}
+#else
+static inline bool uaccess_ttbr0_disable(void)
+{
+	return false;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+	return false;
+}
+#endif
+
+#define __uaccess_disable(alt)						\
+do {									\
+	if (!uaccess_ttbr0_disable())					\
+		asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt,		\
+				CONFIG_ARM64_PAN));			\
+} while (0)
+
+#define __uaccess_enable(alt)						\
+do {									\
+	if (!uaccess_ttbr0_enable())					\
+		asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt,		\
+				CONFIG_ARM64_PAN));			\
+} while (0)
+
+static inline void uaccess_disable(void)
+{
+	__uaccess_disable(ARM64_HAS_PAN);
+}
+
+static inline void uaccess_enable(void)
+{
+	__uaccess_enable(ARM64_HAS_PAN);
+}
+
+/*
+ * These functions are no-ops when UAO is present.
+ */
+static inline void uaccess_disable_not_uao(void)
+{
+	__uaccess_disable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+static inline void uaccess_enable_not_uao(void)
+{
+	__uaccess_enable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+/*
  * The "__xxx" versions of the user access functions do not verify the address
  * space - it must have been done previously with a separate "access_ok()"
  * call.
@@ -146,8 +243,7 @@ static inline void set_fs(mm_segment_t fs)
 do {									\
 	unsigned long __gu_val;						\
 	__chk_user_ptr(ptr);						\
-	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
-			CONFIG_ARM64_PAN));				\
+	uaccess_enable_not_uao();					\
 	switch (sizeof(*(ptr))) {					\
 	case 1:								\
 		__get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr),  \
@@ -168,9 +264,8 @@ do {									\
 	default:							\
 		BUILD_BUG();						\
 	}								\
+	uaccess_disable_not_uao();					\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
-	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
-			CONFIG_ARM64_PAN));				\
 } while (0)
 
 #define __get_user(x, ptr)						\
@@ -215,8 +310,7 @@ do {									\
 do {									\
 	__typeof__(*(ptr)) __pu_val = (x);				\
 	__chk_user_ptr(ptr);						\
-	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
-			CONFIG_ARM64_PAN));				\
+	uaccess_enable_not_uao();					\
 	switch (sizeof(*(ptr))) {					\
 	case 1:								\
 		__put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr),	\
@@ -237,8 +331,7 @@ do {									\
 	default:							\
 		BUILD_BUG();						\
 	}								\
-	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
-			CONFIG_ARM64_PAN));				\
+	uaccess_disable_not_uao();					\
 } while (0)
 
 #define __put_user(x, ptr)						\
@@ -331,4 +424,66 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count);
 extern __must_check long strlen_user(const char __user *str);
 extern __must_check long strnlen_user(const char __user *str, long n);
 
+#else	/* __ASSEMBLY__ */
+
+#include <asm/assembler.h>
+
+/*
+ * User access enabling/disabling macros.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	.macro	__uaccess_ttbr0_disable, tmp1
+	mrs	\tmp1, ttbr1_el1		// swapper_pg_dir
+	add	\tmp1, \tmp1, #SWAPPER_DIR_SIZE	// reserved_ttbr0 at the end of swapper_pg_dir
+	msr	ttbr0_el1, \tmp1		// set reserved TTBR0_EL1
+	isb
+	.endm
+
+	.macro	__uaccess_ttbr0_enable, tmp1
+	get_thread_info \tmp1
+	ldr	\tmp1, [\tmp1, #TSK_TI_TTBR0]	// load saved TTBR0_EL1
+	msr	ttbr0_el1, \tmp1		// set the non-PAN TTBR0_EL1
+	isb
+	.endm
+
+	.macro	uaccess_ttbr0_disable, tmp1
+alternative_if_not ARM64_HAS_PAN
+	__uaccess_ttbr0_disable \tmp1
+alternative_else_nop_endif
+	.endm
+
+	.macro	uaccess_ttbr0_enable, tmp1, tmp2
+alternative_if_not ARM64_HAS_PAN
+	save_and_disable_irq \tmp2		// avoid preemption
+	__uaccess_ttbr0_enable \tmp1
+	restore_irq \tmp2
+alternative_else_nop_endif
+	.endm
+#else
+	.macro	uaccess_ttbr0_disable, tmp1
+	.endm
+
+	.macro	uaccess_ttbr0_enable, tmp1, tmp2
+	.endm
+#endif
+
+/*
+ * These macros are no-ops when UAO is present.
+ */
+	.macro	uaccess_disable_not_uao, tmp1
+	uaccess_ttbr0_disable \tmp1
+alternative_if ARM64_ALT_PAN_NOT_UAO
+	SET_PSTATE_PAN(1)
+alternative_else_nop_endif
+	.endm
+
+	.macro	uaccess_enable_not_uao, tmp1, tmp2
+	uaccess_ttbr0_enable \tmp1, \tmp2
+alternative_if ARM64_ALT_PAN_NOT_UAO
+	SET_PSTATE_PAN(0)
+alternative_else_nop_endif
+	.endm
+
+#endif	/* __ASSEMBLY__ */
+
 #endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
new file mode 100644
index 000000000000..8d004073d0e8
--- /dev/null
+++ b/arch/arm64/include/asm/uprobes.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UPROBES_H
+#define _ASM_UPROBES_H
+
+#include <asm/debug-monitors.h>
+#include <asm/insn.h>
+#include <asm/probes.h>
+
+#define MAX_UINSN_BYTES		AARCH64_INSN_SIZE
+
+#define UPROBE_SWBP_INSN	BRK64_OPCODE_UPROBES
+#define UPROBE_SWBP_INSN_SIZE	AARCH64_INSN_SIZE
+#define UPROBE_XOL_SLOT_BYTES	MAX_UINSN_BYTES
+
+typedef u32 uprobe_opcode_t;
+
+struct arch_uprobe_task {
+};
+
+struct arch_uprobe {
+	union {
+		u8 insn[MAX_UINSN_BYTES];
+		u8 ixol[MAX_UINSN_BYTES];
+	};
+	struct arch_probe_insn api;
+	bool simulate;
+};
+
+#endif
diff --git a/arch/arm64/include/asm/xen/hypercall.h b/arch/arm64/include/asm/xen/hypercall.h
index 74b0c423ff5b..3522cbaed316 100644
--- a/arch/arm64/include/asm/xen/hypercall.h
+++ b/arch/arm64/include/asm/xen/hypercall.h
@@ -1 +1 @@
-#include <../../arm/include/asm/xen/hypercall.h>
+#include <xen/arm/hypercall.h>
diff --git a/arch/arm64/include/asm/xen/hypervisor.h b/arch/arm64/include/asm/xen/hypervisor.h
index f263da8e8769..d6e7709d0688 100644
--- a/arch/arm64/include/asm/xen/hypervisor.h
+++ b/arch/arm64/include/asm/xen/hypervisor.h
@@ -1 +1 @@
-#include <../../arm/include/asm/xen/hypervisor.h>
+#include <xen/arm/hypervisor.h>
diff --git a/arch/arm64/include/asm/xen/interface.h b/arch/arm64/include/asm/xen/interface.h
index 44457aebeed4..88c0d75da190 100644
--- a/arch/arm64/include/asm/xen/interface.h
+++ b/arch/arm64/include/asm/xen/interface.h
@@ -1 +1 @@
-#include <../../arm/include/asm/xen/interface.h>
+#include <xen/arm/interface.h>
diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h
index 2052102b4e02..b3ef061d8b74 100644
--- a/arch/arm64/include/asm/xen/page-coherent.h
+++ b/arch/arm64/include/asm/xen/page-coherent.h
@@ -1 +1 @@
-#include <../../arm/include/asm/xen/page-coherent.h>
+#include <xen/arm/page-coherent.h>
diff --git a/arch/arm64/include/asm/xen/page.h b/arch/arm64/include/asm/xen/page.h
index bed87ec36780..31bbc803cecb 100644
--- a/arch/arm64/include/asm/xen/page.h
+++ b/arch/arm64/include/asm/xen/page.h
@@ -1 +1 @@
-#include <../../arm/include/asm/xen/page.h>
+#include <xen/arm/page.h>
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index b0988bb1bf64..04de188a36c9 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -14,10 +14,8 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 
-#include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/insn.h>
-#include <asm/opcodes.h>
 #include <asm/sysreg.h>
 #include <asm/system_misc.h>
 #include <asm/traps.h>
@@ -285,10 +283,10 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
 #define __SWP_LL_SC_LOOPS	4
 
 #define __user_swpX_asm(data, addr, res, temp, temp2, B)	\
+do {								\
+	uaccess_enable();					\
 	__asm__ __volatile__(					\
 	"	mov		%w3, %w7\n"			\
-	ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,	\
-		    CONFIG_ARM64_PAN)				\
 	"0:	ldxr"B"		%w2, [%4]\n"			\
 	"1:	stxr"B"		%w0, %w1, [%4]\n"		\
 	"	cbz		%w0, 2f\n"			\
@@ -306,12 +304,12 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
 	"	.popsection"					\
 	_ASM_EXTABLE(0b, 4b)					\
 	_ASM_EXTABLE(1b, 4b)					\
-	ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,	\
-		CONFIG_ARM64_PAN)				\
 	: "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2)	\
 	: "r" (addr), "i" (-EAGAIN), "i" (-EFAULT),		\
 	  "i" (__SWP_LL_SC_LOOPS)				\
-	: "memory")
+	: "memory");						\
+	uaccess_disable();					\
+} while (0)
 
 #define __user_swp_asm(data, addr, res, temp, temp2) \
 	__user_swpX_asm(data, addr, res, temp, temp2, "")
@@ -352,6 +350,10 @@ static int emulate_swpX(unsigned int address, unsigned int *data,
 	return res;
 }
 
+#define ARM_OPCODE_CONDTEST_FAIL   0
+#define ARM_OPCODE_CONDTEST_PASS   1
+#define ARM_OPCODE_CONDTEST_UNCOND 2
+
 #define	ARM_OPCODE_CONDITION_UNCOND	0xf
 
 static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 4a2f0f0fef32..bc049afc73a7 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -36,11 +36,13 @@ int main(void)
 {
   DEFINE(TSK_ACTIVE_MM,		offsetof(struct task_struct, active_mm));
   BLANK();
-  DEFINE(TI_FLAGS,		offsetof(struct thread_info, flags));
-  DEFINE(TI_PREEMPT,		offsetof(struct thread_info, preempt_count));
-  DEFINE(TI_ADDR_LIMIT,		offsetof(struct thread_info, addr_limit));
-  DEFINE(TI_TASK,		offsetof(struct thread_info, task));
-  DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
+  DEFINE(TSK_TI_FLAGS,		offsetof(struct task_struct, thread_info.flags));
+  DEFINE(TSK_TI_PREEMPT,	offsetof(struct task_struct, thread_info.preempt_count));
+  DEFINE(TSK_TI_ADDR_LIMIT,	offsetof(struct task_struct, thread_info.addr_limit));
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+  DEFINE(TSK_TI_TTBR0,		offsetof(struct task_struct, thread_info.ttbr0));
+#endif
+  DEFINE(TSK_STACK,		offsetof(struct task_struct, stack));
   BLANK();
   DEFINE(THREAD_CPU_CONTEXT,	offsetof(struct task_struct, thread.cpu_context));
   BLANK();
@@ -123,6 +125,7 @@ int main(void)
   DEFINE(TZ_DSTTIME,		offsetof(struct timezone, tz_dsttime));
   BLANK();
   DEFINE(CPU_BOOT_STACK,	offsetof(struct secondary_data, stack));
+  DEFINE(CPU_BOOT_TASK,		offsetof(struct secondary_data, task));
   BLANK();
 #ifdef CONFIG_KVM_ARM_HOST
   DEFINE(VCPU_CONTEXT,		offsetof(struct kvm_vcpu, arch.ctxt));
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c02504ea304b..fdf8f045929f 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -47,6 +47,7 @@ unsigned int compat_elf_hwcap2 __read_mostly;
 #endif
 
 DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
+EXPORT_SYMBOL(cpu_hwcaps);
 
 DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS);
 EXPORT_SYMBOL(cpu_hwcap_keys);
@@ -746,6 +747,14 @@ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
 	return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
 }
 
+static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused)
+{
+	u64 pfr0 = read_system_reg(SYS_ID_AA64PFR0_EL1);
+
+	return cpuid_feature_extract_signed_field(pfr0,
+					ID_AA64PFR0_FP_SHIFT) < 0;
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -829,6 +838,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.def_scope = SCOPE_SYSTEM,
 		.matches = hyp_offset_low,
 	},
+	{
+		/* FP/SIMD is not implemented */
+		.capability = ARM64_HAS_NO_FPSIMD,
+		.def_scope = SCOPE_SYSTEM,
+		.min_field_value = 0,
+		.matches = has_no_fpsimd,
+	},
 	{},
 };
 
@@ -1102,5 +1118,5 @@ void __init setup_cpu_features(void)
 static bool __maybe_unused
 cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused)
 {
-	return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO));
+	return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO));
 }
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 73ae90ef434c..605df76f0a06 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -226,6 +226,8 @@ static void send_user_sigtrap(int si_code)
 static int single_step_handler(unsigned long addr, unsigned int esr,
 			       struct pt_regs *regs)
 {
+	bool handler_found = false;
+
 	/*
 	 * If we are stepping a pending breakpoint, call the hw_breakpoint
 	 * handler first.
@@ -233,7 +235,14 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
 	if (!reinstall_suspended_bps(regs))
 		return 0;
 
-	if (user_mode(regs)) {
+#ifdef	CONFIG_KPROBES
+	if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED)
+		handler_found = true;
+#endif
+	if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED)
+		handler_found = true;
+
+	if (!handler_found && user_mode(regs)) {
 		send_user_sigtrap(TRAP_TRACE);
 
 		/*
@@ -243,15 +252,8 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
 		 * to the active-not-pending state).
 		 */
 		user_rewind_single_step(current);
-	} else {
-#ifdef	CONFIG_KPROBES
-		if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED)
-			return 0;
-#endif
-		if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED)
-			return 0;
-
-		pr_warning("Unexpected kernel single-step exception at EL1\n");
+	} else if (!handler_found) {
+		pr_warn("Unexpected kernel single-step exception at EL1\n");
 		/*
 		 * Re-enable stepping since we know that we will be
 		 * returning to regs.
@@ -304,16 +306,20 @@ NOKPROBE_SYMBOL(call_break_hook);
 static int brk_handler(unsigned long addr, unsigned int esr,
 		       struct pt_regs *regs)
 {
-	if (user_mode(regs)) {
-		send_user_sigtrap(TRAP_BRKPT);
-	}
+	bool handler_found = false;
+
 #ifdef	CONFIG_KPROBES
-	else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) {
-		if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED)
-			return -EFAULT;
+	if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) {
+		if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED)
+			handler_found = true;
 	}
 #endif
-	else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) {
+	if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED)
+		handler_found = true;
+
+	if (!handler_found && user_mode(regs)) {
+		send_user_sigtrap(TRAP_BRKPT);
+	} else if (!handler_found) {
 		pr_warn("Unexpected kernel BRK exception at EL1\n");
 		return -EFAULT;
 	}
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index ba9bee389fd5..5d17f377d905 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -62,8 +62,8 @@ struct screen_info screen_info __section(.data);
 int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
 {
 	pteval_t prot_val = create_mapping_protection(md);
-	bool allow_block_mappings = (md->type != EFI_RUNTIME_SERVICES_CODE &&
-				     md->type != EFI_RUNTIME_SERVICES_DATA);
+	bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE ||
+				   md->type == EFI_RUNTIME_SERVICES_DATA);
 
 	if (!PAGE_ALIGNED(md->phys_addr) ||
 	    !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT)) {
@@ -76,12 +76,12 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
 		 * from the MMU routines. So avoid block mappings altogether in
 		 * that case.
 		 */
-		allow_block_mappings = false;
+		page_mappings_only = true;
 	}
 
 	create_pgd_mapping(mm, md->phys_addr, md->virt_addr,
 			   md->num_pages << EFI_PAGE_SHIFT,
-			   __pgprot(prot_val | PTE_NG), allow_block_mappings);
+			   __pgprot(prot_val | PTE_NG), page_mappings_only);
 	return 0;
 }
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 223d54a4d66b..4f0d76339414 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -29,7 +29,9 @@
 #include <asm/esr.h>
 #include <asm/irq.h>
 #include <asm/memory.h>
+#include <asm/ptrace.h>
 #include <asm/thread_info.h>
+#include <asm/uaccess.h>
 #include <asm/unistd.h>
 
 /*
@@ -90,9 +92,8 @@
 
 	.if	\el == 0
 	mrs	x21, sp_el0
-	mov	tsk, sp
-	and	tsk, tsk, #~(THREAD_SIZE - 1)	// Ensure MDSCR_EL1.SS is clear,
-	ldr	x19, [tsk, #TI_FLAGS]		// since we can unmask debug
+	ldr_this_cpu	tsk, __entry_task, x20	// Ensure MDSCR_EL1.SS is clear,
+	ldr	x19, [tsk, #TSK_TI_FLAGS]	// since we can unmask debug
 	disable_step_tsk x19, x20		// exceptions when scheduling.
 
 	mov	x29, xzr			// fp pointed to user-space
@@ -100,15 +101,41 @@
 	add	x21, sp, #S_FRAME_SIZE
 	get_thread_info tsk
 	/* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
-	ldr	x20, [tsk, #TI_ADDR_LIMIT]
+	ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 	str	x20, [sp, #S_ORIG_ADDR_LIMIT]
 	mov	x20, #TASK_SIZE_64
-	str	x20, [tsk, #TI_ADDR_LIMIT]
+	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 	/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
 	.endif /* \el == 0 */
 	mrs	x22, elr_el1
 	mrs	x23, spsr_el1
 	stp	lr, x21, [sp, #S_LR]
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	/*
+	 * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
+	 * EL0, there is no need to check the state of TTBR0_EL1 since
+	 * accesses are always enabled.
+	 * Note that the meaning of this bit differs from the ARMv8.1 PAN
+	 * feature as all TTBR0_EL1 accesses are disabled, not just those to
+	 * user mappings.
+	 */
+alternative_if ARM64_HAS_PAN
+	b	1f				// skip TTBR0 PAN
+alternative_else_nop_endif
+
+	.if	\el != 0
+	mrs	x21, ttbr0_el1
+	tst	x21, #0xffff << 48		// Check for the reserved ASID
+	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
+	b.eq	1f				// TTBR0 access already disabled
+	and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR
+	.endif
+
+	__uaccess_ttbr0_disable x21
+1:
+#endif
+
 	stp	x22, x23, [sp, #S_PC]
 
 	/*
@@ -139,7 +166,7 @@
 	.if	\el != 0
 	/* Restore the task's original addr_limit. */
 	ldr	x20, [sp, #S_ORIG_ADDR_LIMIT]
-	str	x20, [tsk, #TI_ADDR_LIMIT]
+	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 
 	/* No need to restore UAO, it will be restored from SPSR_EL1 */
 	.endif
@@ -147,6 +174,40 @@
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 	.if	\el == 0
 	ct_user_enter
+	.endif
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	/*
+	 * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
+	 * PAN bit checking.
+	 */
+alternative_if ARM64_HAS_PAN
+	b	2f				// skip TTBR0 PAN
+alternative_else_nop_endif
+
+	.if	\el != 0
+	tbnz	x22, #22, 1f			// Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
+	.endif
+
+	__uaccess_ttbr0_enable x0
+
+	.if	\el == 0
+	/*
+	 * Enable errata workarounds only if returning to user. The only
+	 * workaround currently required for TTBR0_EL1 changes are for the
+	 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
+	 * corruption).
+	 */
+	post_ttbr0_update_workaround
+	.endif
+1:
+	.if	\el != 0
+	and	x22, x22, #~PSR_PAN_BIT		// ARMv8.0 CPUs do not understand this bit
+	.endif
+2:
+#endif
+
+	.if	\el == 0
 	ldr	x23, [sp, #S_SP]		// load return stack pointer
 	msr	sp_el0, x23
 #ifdef CONFIG_ARM64_ERRATUM_845719
@@ -162,6 +223,7 @@ alternative_if ARM64_WORKAROUND_845719
 alternative_else_nop_endif
 #endif
 	.endif
+
 	msr	elr_el1, x21			// set up the return data
 	msr	spsr_el1, x22
 	ldp	x0, x1, [sp, #16 * 0]
@@ -184,23 +246,20 @@ alternative_else_nop_endif
 	eret					// return to kernel
 	.endm
 
-	.macro	get_thread_info, rd
-	mrs	\rd, sp_el0
-	.endm
-
 	.macro	irq_stack_entry
 	mov	x19, sp			// preserve the original sp
 
 	/*
-	 * Compare sp with the current thread_info, if the top
-	 * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
-	 * should switch to the irq stack.
+	 * Compare sp with the base of the task stack.
+	 * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
+	 * and should switch to the irq stack.
 	 */
-	and	x25, x19, #~(THREAD_SIZE - 1)
-	cmp	x25, tsk
-	b.ne	9998f
+	ldr	x25, [tsk, TSK_STACK]
+	eor	x25, x25, x19
+	and	x25, x25, #~(THREAD_SIZE - 1)
+	cbnz	x25, 9998f
 
-	this_cpu_ptr irq_stack, x25, x26
+	adr_this_cpu x25, irq_stack, x26
 	mov	x26, #IRQ_STACK_START_SP
 	add	x26, x25, x26
 
@@ -427,9 +486,9 @@ el1_irq:
 	irq_handler
 
 #ifdef CONFIG_PREEMPT
-	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count
+	ldr	w24, [tsk, #TSK_TI_PREEMPT]	// get preempt count
 	cbnz	w24, 1f				// preempt count != 0
-	ldr	x0, [tsk, #TI_FLAGS]		// get flags
+	ldr	x0, [tsk, #TSK_TI_FLAGS]	// get flags
 	tbz	x0, #TIF_NEED_RESCHED, 1f	// needs rescheduling?
 	bl	el1_preempt
 1:
@@ -444,7 +503,7 @@ ENDPROC(el1_irq)
 el1_preempt:
 	mov	x24, lr
 1:	bl	preempt_schedule_irq		// irq en/disable is done inside
-	ldr	x0, [tsk, #TI_FLAGS]		// get new tasks TI_FLAGS
+	ldr	x0, [tsk, #TSK_TI_FLAGS]	// get new tasks TI_FLAGS
 	tbnz	x0, #TIF_NEED_RESCHED, 1b	// needs rescheduling?
 	ret	x24
 #endif
@@ -674,8 +733,7 @@ ENTRY(cpu_switch_to)
 	ldp	x29, x9, [x8], #16
 	ldr	lr, [x8]
 	mov	sp, x9
-	and	x9, x9, #~(THREAD_SIZE - 1)
-	msr	sp_el0, x9
+	msr	sp_el0, x1
 	ret
 ENDPROC(cpu_switch_to)
 
@@ -686,7 +744,7 @@ ENDPROC(cpu_switch_to)
 ret_fast_syscall:
 	disable_irq				// disable interrupts
 	str	x0, [sp, #S_X0]			// returned x0
-	ldr	x1, [tsk, #TI_FLAGS]		// re-check for syscall tracing
+	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
 	and	x2, x1, #_TIF_SYSCALL_WORK
 	cbnz	x2, ret_fast_syscall_trace
 	and	x2, x1, #_TIF_WORK_MASK
@@ -706,14 +764,14 @@ work_pending:
 #ifdef CONFIG_TRACE_IRQFLAGS
 	bl	trace_hardirqs_on		// enabled while in userspace
 #endif
-	ldr	x1, [tsk, #TI_FLAGS]		// re-check for single-step
+	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
 	b	finish_ret_to_user
 /*
  * "slow" syscall return path.
  */
 ret_to_user:
 	disable_irq				// disable interrupts
-	ldr	x1, [tsk, #TI_FLAGS]
+	ldr	x1, [tsk, #TSK_TI_FLAGS]
 	and	x2, x1, #_TIF_WORK_MASK
 	cbnz	x2, work_pending
 finish_ret_to_user:
@@ -746,7 +804,7 @@ el0_svc_naked:					// compat entry point
 	enable_dbg_and_irq
 	ct_user_exit 1
 
-	ldr	x16, [tsk, #TI_FLAGS]		// check for syscall hooks
+	ldr	x16, [tsk, #TSK_TI_FLAGS]	// check for syscall hooks
 	tst	x16, #_TIF_SYSCALL_WORK
 	b.ne	__sys_trace
 	cmp     scno, sc_nr                     // check upper syscall limit
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 394c61db5566..b883f1f75216 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -127,6 +127,8 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
 
 void fpsimd_thread_switch(struct task_struct *next)
 {
+	if (!system_supports_fpsimd())
+		return;
 	/*
 	 * Save the current FPSIMD state to memory, but only if whatever is in
 	 * the registers is in fact the most recent userland FPSIMD state of
@@ -157,6 +159,8 @@ void fpsimd_thread_switch(struct task_struct *next)
 
 void fpsimd_flush_thread(void)
 {
+	if (!system_supports_fpsimd())
+		return;
 	memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
 	fpsimd_flush_task_state(current);
 	set_thread_flag(TIF_FOREIGN_FPSTATE);
@@ -168,6 +172,8 @@ void fpsimd_flush_thread(void)
  */
 void fpsimd_preserve_current_state(void)
 {
+	if (!system_supports_fpsimd())
+		return;
 	preempt_disable();
 	if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
 		fpsimd_save_state(&current->thread.fpsimd_state);
@@ -181,6 +187,8 @@ void fpsimd_preserve_current_state(void)
  */
 void fpsimd_restore_current_state(void)
 {
+	if (!system_supports_fpsimd())
+		return;
 	preempt_disable();
 	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
 		struct fpsimd_state *st = &current->thread.fpsimd_state;
@@ -199,6 +207,8 @@ void fpsimd_restore_current_state(void)
  */
 void fpsimd_update_current_state(struct fpsimd_state *state)
 {
+	if (!system_supports_fpsimd())
+		return;
 	preempt_disable();
 	fpsimd_load_state(state);
 	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
@@ -228,6 +238,8 @@ static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
  */
 void kernel_neon_begin_partial(u32 num_regs)
 {
+	if (WARN_ON(!system_supports_fpsimd()))
+		return;
 	if (in_interrupt()) {
 		struct fpsimd_partial_state *s = this_cpu_ptr(
 			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
@@ -252,6 +264,8 @@ EXPORT_SYMBOL(kernel_neon_begin_partial);
 
 void kernel_neon_end(void)
 {
+	if (!system_supports_fpsimd())
+		return;
 	if (in_interrupt()) {
 		struct fpsimd_partial_state *s = this_cpu_ptr(
 			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 332e33193ccf..4b1abac3485a 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -326,14 +326,14 @@ __create_page_tables:
 	 * dirty cache lines being evicted.
 	 */
 	adrp	x0, idmap_pg_dir
-	adrp	x1, swapper_pg_dir + SWAPPER_DIR_SIZE
+	adrp	x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
 	bl	__inval_cache_range
 
 	/*
 	 * Clear the idmap and swapper page tables.
 	 */
 	adrp	x0, idmap_pg_dir
-	adrp	x6, swapper_pg_dir + SWAPPER_DIR_SIZE
+	adrp	x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
 1:	stp	xzr, xzr, [x0], #16
 	stp	xzr, xzr, [x0], #16
 	stp	xzr, xzr, [x0], #16
@@ -412,7 +412,7 @@ __create_page_tables:
 	 * tables again to remove any speculatively loaded cache lines.
 	 */
 	adrp	x0, idmap_pg_dir
-	adrp	x1, swapper_pg_dir + SWAPPER_DIR_SIZE
+	adrp	x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
 	dmb	sy
 	bl	__inval_cache_range
 
@@ -428,7 +428,8 @@ ENDPROC(__create_page_tables)
 __primary_switched:
 	adrp	x4, init_thread_union
 	add	sp, x4, #THREAD_SIZE
-	msr	sp_el0, x4			// Save thread_info
+	adr_l	x5, init_task
+	msr	sp_el0, x5			// Save thread_info
 
 	adr_l	x8, vectors			// load VBAR_EL1 with virtual
 	msr	vbar_el1, x8			// vector table address
@@ -524,10 +525,21 @@ set_hcr:
 	msr	hcr_el2, x0
 	isb
 
-	/* Generic timers. */
+	/*
+	 * Allow Non-secure EL1 and EL0 to access physical timer and counter.
+	 * This is not necessary for VHE, since the host kernel runs in EL2,
+	 * and EL0 accesses are configured in the later stage of boot process.
+	 * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout
+	 * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined
+	 * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1
+	 * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in
+	 * EL2.
+	 */
+	cbnz	x2, 1f
 	mrs	x0, cnthctl_el2
 	orr	x0, x0, #3			// Enable EL1 physical timers
 	msr	cnthctl_el2, x0
+1:
 	msr	cntvoff_el2, xzr		// Clear virtual offset
 
 #ifdef CONFIG_ARM_GIC_V3
@@ -699,10 +711,10 @@ __secondary_switched:
 	isb
 
 	adr_l	x0, secondary_data
-	ldr	x0, [x0, #CPU_BOOT_STACK]	// get secondary_data.stack
-	mov	sp, x0
-	and	x0, x0, #~(THREAD_SIZE - 1)
-	msr	sp_el0, x0			// save thread_info
+	ldr	x1, [x0, #CPU_BOOT_STACK]	// get secondary_data.stack
+	mov	sp, x1
+	ldr	x2, [x0, #CPU_BOOT_TASK]
+	msr	sp_el0, x2
 	mov	x29, #0
 	b	secondary_start_kernel
 ENDPROC(__secondary_switched)
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index d55a7b09959b..fe301cbcb442 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -136,7 +136,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
 
 	/* Save the mpidr of the cpu we called cpu_suspend() on... */
 	if (sleep_cpu < 0) {
-		pr_err("Failing to hibernate on an unkown CPU.\n");
+		pr_err("Failing to hibernate on an unknown CPU.\n");
 		return -ENODEV;
 	}
 	hdr->sleep_cpu_mpidr = cpu_logical_map(sleep_cpu);
@@ -547,7 +547,7 @@ out:
 int hibernate_resume_nonboot_cpu_disable(void)
 {
 	if (sleep_cpu < 0) {
-		pr_err("Failing to resume from hibernate on an unkown CPU.\n");
+		pr_err("Failing to resume from hibernate on an unknown CPU.\n");
 		return -ENODEV;
 	}
 
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 948b73148d56..1b3c747fedda 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -317,9 +317,21 @@ static int get_hbp_len(u8 hbp_len)
 	case ARM_BREAKPOINT_LEN_2:
 		len_in_bytes = 2;
 		break;
+	case ARM_BREAKPOINT_LEN_3:
+		len_in_bytes = 3;
+		break;
 	case ARM_BREAKPOINT_LEN_4:
 		len_in_bytes = 4;
 		break;
+	case ARM_BREAKPOINT_LEN_5:
+		len_in_bytes = 5;
+		break;
+	case ARM_BREAKPOINT_LEN_6:
+		len_in_bytes = 6;
+		break;
+	case ARM_BREAKPOINT_LEN_7:
+		len_in_bytes = 7;
+		break;
 	case ARM_BREAKPOINT_LEN_8:
 		len_in_bytes = 8;
 		break;
@@ -349,7 +361,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
  * to generic breakpoint descriptions.
  */
 int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
-			   int *gen_len, int *gen_type)
+			   int *gen_len, int *gen_type, int *offset)
 {
 	/* Type */
 	switch (ctrl.type) {
@@ -369,17 +381,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
 		return -EINVAL;
 	}
 
+	if (!ctrl.len)
+		return -EINVAL;
+	*offset = __ffs(ctrl.len);
+
 	/* Len */
-	switch (ctrl.len) {
+	switch (ctrl.len >> *offset) {
 	case ARM_BREAKPOINT_LEN_1:
 		*gen_len = HW_BREAKPOINT_LEN_1;
 		break;
 	case ARM_BREAKPOINT_LEN_2:
 		*gen_len = HW_BREAKPOINT_LEN_2;
 		break;
+	case ARM_BREAKPOINT_LEN_3:
+		*gen_len = HW_BREAKPOINT_LEN_3;
+		break;
 	case ARM_BREAKPOINT_LEN_4:
 		*gen_len = HW_BREAKPOINT_LEN_4;
 		break;
+	case ARM_BREAKPOINT_LEN_5:
+		*gen_len = HW_BREAKPOINT_LEN_5;
+		break;
+	case ARM_BREAKPOINT_LEN_6:
+		*gen_len = HW_BREAKPOINT_LEN_6;
+		break;
+	case ARM_BREAKPOINT_LEN_7:
+		*gen_len = HW_BREAKPOINT_LEN_7;
+		break;
 	case ARM_BREAKPOINT_LEN_8:
 		*gen_len = HW_BREAKPOINT_LEN_8;
 		break;
@@ -423,9 +451,21 @@ static int arch_build_bp_info(struct perf_event *bp)
 	case HW_BREAKPOINT_LEN_2:
 		info->ctrl.len = ARM_BREAKPOINT_LEN_2;
 		break;
+	case HW_BREAKPOINT_LEN_3:
+		info->ctrl.len = ARM_BREAKPOINT_LEN_3;
+		break;
 	case HW_BREAKPOINT_LEN_4:
 		info->ctrl.len = ARM_BREAKPOINT_LEN_4;
 		break;
+	case HW_BREAKPOINT_LEN_5:
+		info->ctrl.len = ARM_BREAKPOINT_LEN_5;
+		break;
+	case HW_BREAKPOINT_LEN_6:
+		info->ctrl.len = ARM_BREAKPOINT_LEN_6;
+		break;
+	case HW_BREAKPOINT_LEN_7:
+		info->ctrl.len = ARM_BREAKPOINT_LEN_7;
+		break;
 	case HW_BREAKPOINT_LEN_8:
 		info->ctrl.len = ARM_BREAKPOINT_LEN_8;
 		break;
@@ -517,18 +557,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 		default:
 			return -EINVAL;
 		}
-
-		info->address &= ~alignment_mask;
-		info->ctrl.len <<= offset;
 	} else {
 		if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE)
 			alignment_mask = 0x3;
 		else
 			alignment_mask = 0x7;
-		if (info->address & alignment_mask)
-			return -EINVAL;
+		offset = info->address & alignment_mask;
 	}
 
+	info->address &= ~alignment_mask;
+	info->ctrl.len <<= offset;
+
 	/*
 	 * Disallow per-task kernel breakpoints since these would
 	 * complicate the stepping code.
@@ -661,12 +700,47 @@ unlock:
 }
 NOKPROBE_SYMBOL(breakpoint_handler);
 
+/*
+ * Arm64 hardware does not always report a watchpoint hit address that matches
+ * one of the watchpoints set. It can also report an address "near" the
+ * watchpoint if a single instruction access both watched and unwatched
+ * addresses. There is no straight-forward way, short of disassembling the
+ * offending instruction, to map that address back to the watchpoint. This
+ * function computes the distance of the memory access from the watchpoint as a
+ * heuristic for the likelyhood that a given access triggered the watchpoint.
+ *
+ * See Section D2.10.5 "Determining the memory location that caused a Watchpoint
+ * exception" of ARMv8 Architecture Reference Manual for details.
+ *
+ * The function returns the distance of the address from the bytes watched by
+ * the watchpoint. In case of an exact match, it returns 0.
+ */
+static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
+					struct arch_hw_breakpoint_ctrl *ctrl)
+{
+	u64 wp_low, wp_high;
+	u32 lens, lene;
+
+	lens = __ffs(ctrl->len);
+	lene = __fls(ctrl->len);
+
+	wp_low = val + lens;
+	wp_high = val + lene;
+	if (addr < wp_low)
+		return wp_low - addr;
+	else if (addr > wp_high)
+		return addr - wp_high;
+	else
+		return 0;
+}
+
 static int watchpoint_handler(unsigned long addr, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	int i, step = 0, *kernel_step, access;
+	int i, step = 0, *kernel_step, access, closest_match = 0;
+	u64 min_dist = -1, dist;
 	u32 ctrl_reg;
-	u64 val, alignment_mask;
+	u64 val;
 	struct perf_event *wp, **slots;
 	struct debug_info *debug_info;
 	struct arch_hw_breakpoint *info;
@@ -675,35 +749,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
 	slots = this_cpu_ptr(wp_on_reg);
 	debug_info = &current->thread.debug;
 
+	/*
+	 * Find all watchpoints that match the reported address. If no exact
+	 * match is found. Attribute the hit to the closest watchpoint.
+	 */
+	rcu_read_lock();
 	for (i = 0; i < core_num_wrps; ++i) {
-		rcu_read_lock();
-
 		wp = slots[i];
-
 		if (wp == NULL)
-			goto unlock;
-
-		info = counter_arch_bp(wp);
-		/* AArch32 watchpoints are either 4 or 8 bytes aligned. */
-		if (is_compat_task()) {
-			if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
-				alignment_mask = 0x7;
-			else
-				alignment_mask = 0x3;
-		} else {
-			alignment_mask = 0x7;
-		}
-
-		/* Check if the watchpoint value matches. */
-		val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
-		if (val != (addr & ~alignment_mask))
-			goto unlock;
-
-		/* Possible match, check the byte address select to confirm. */
-		ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
-		decode_ctrl_reg(ctrl_reg, &ctrl);
-		if (!((1 << (addr & alignment_mask)) & ctrl.len))
-			goto unlock;
+			continue;
 
 		/*
 		 * Check that the access type matches.
@@ -712,18 +766,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
 		access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W :
 			 HW_BREAKPOINT_R;
 		if (!(access & hw_breakpoint_type(wp)))
-			goto unlock;
+			continue;
 
+		/* Check if the watchpoint value and byte select match. */
+		val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
+		ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
+		decode_ctrl_reg(ctrl_reg, &ctrl);
+		dist = get_distance_from_watchpoint(addr, val, &ctrl);
+		if (dist < min_dist) {
+			min_dist = dist;
+			closest_match = i;
+		}
+		/* Is this an exact match? */
+		if (dist != 0)
+			continue;
+
+		info = counter_arch_bp(wp);
 		info->trigger = addr;
 		perf_bp_event(wp, regs);
 
 		/* Do we need to handle the stepping? */
 		if (is_default_overflow_handler(wp))
 			step = 1;
+	}
+	if (min_dist > 0 && min_dist != -1) {
+		/* No exact match found. */
+		wp = slots[closest_match];
+		info = counter_arch_bp(wp);
+		info->trigger = addr;
+		perf_bp_event(wp, regs);
 
-unlock:
-		rcu_read_unlock();
+		/* Do we need to handle the stepping? */
+		if (is_default_overflow_handler(wp))
+			step = 1;
 	}
+	rcu_read_unlock();
 
 	if (!step)
 		return 0;
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 6f2ac4fc66ca..94b62c1fa4df 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -30,7 +30,6 @@
 #include <asm/cacheflush.h>
 #include <asm/debug-monitors.h>
 #include <asm/fixmap.h>
-#include <asm/opcodes.h>
 #include <asm/insn.h>
 
 #define AARCH64_INSN_SF_BIT	BIT(31)
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index e017a9493b92..d217c9e95b06 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -247,6 +247,9 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_fn);
 
 static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr)
 {
+	if (!kgdb_single_step)
+		return DBG_HOOK_ERROR;
+
 	kgdb_handle_exception(1, SIGTRAP, 0, regs);
 	return 0;
 }
diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile
index ce06312e3d34..89b6df613dde 100644
--- a/arch/arm64/kernel/probes/Makefile
+++ b/arch/arm64/kernel/probes/Makefile
@@ -1,3 +1,5 @@
 obj-$(CONFIG_KPROBES)		+= kprobes.o decode-insn.o	\
 				   kprobes_trampoline.o		\
 				   simulate-insn.o
+obj-$(CONFIG_UPROBES)		+= uprobes.o decode-insn.o	\
+				   simulate-insn.o
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
index d1731bf977ef..6bf6657a5a52 100644
--- a/arch/arm64/kernel/probes/decode-insn.c
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -17,7 +17,6 @@
 #include <linux/kprobes.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
-#include <asm/kprobes.h>
 #include <asm/insn.h>
 #include <asm/sections.h>
 
@@ -78,8 +77,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
  *   INSN_GOOD         If instruction is supported and uses instruction slot,
  *   INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot.
  */
-static enum kprobe_insn __kprobes
-arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi)
+enum probe_insn __kprobes
+arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api)
 {
 	/*
 	 * Instructions reading or modifying the PC won't work from the XOL
@@ -89,26 +88,26 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi)
 		return INSN_GOOD;
 
 	if (aarch64_insn_is_bcond(insn)) {
-		asi->handler = simulate_b_cond;
+		api->handler = simulate_b_cond;
 	} else if (aarch64_insn_is_cbz(insn) ||
 	    aarch64_insn_is_cbnz(insn)) {
-		asi->handler = simulate_cbz_cbnz;
+		api->handler = simulate_cbz_cbnz;
 	} else if (aarch64_insn_is_tbz(insn) ||
 	    aarch64_insn_is_tbnz(insn)) {
-		asi->handler = simulate_tbz_tbnz;
+		api->handler = simulate_tbz_tbnz;
 	} else if (aarch64_insn_is_adr_adrp(insn)) {
-		asi->handler = simulate_adr_adrp;
+		api->handler = simulate_adr_adrp;
 	} else if (aarch64_insn_is_b(insn) ||
 	    aarch64_insn_is_bl(insn)) {
-		asi->handler = simulate_b_bl;
+		api->handler = simulate_b_bl;
 	} else if (aarch64_insn_is_br(insn) ||
 	    aarch64_insn_is_blr(insn) ||
 	    aarch64_insn_is_ret(insn)) {
-		asi->handler = simulate_br_blr_ret;
+		api->handler = simulate_br_blr_ret;
 	} else if (aarch64_insn_is_ldr_lit(insn)) {
-		asi->handler = simulate_ldr_literal;
+		api->handler = simulate_ldr_literal;
 	} else if (aarch64_insn_is_ldrsw_lit(insn)) {
-		asi->handler = simulate_ldrsw_literal;
+		api->handler = simulate_ldrsw_literal;
 	} else {
 		/*
 		 * Instruction cannot be stepped out-of-line and we don't
@@ -120,6 +119,7 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi)
 	return INSN_GOOD_NO_SLOT;
 }
 
+#ifdef CONFIG_KPROBES
 static bool __kprobes
 is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end)
 {
@@ -138,12 +138,12 @@ is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end)
 	return false;
 }
 
-enum kprobe_insn __kprobes
+enum probe_insn __kprobes
 arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi)
 {
-	enum kprobe_insn decoded;
-	kprobe_opcode_t insn = le32_to_cpu(*addr);
-	kprobe_opcode_t *scan_end = NULL;
+	enum probe_insn decoded;
+	probe_opcode_t insn = le32_to_cpu(*addr);
+	probe_opcode_t *scan_end = NULL;
 	unsigned long size = 0, offset = 0;
 
 	/*
@@ -162,7 +162,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi)
 		else
 			scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE;
 	}
-	decoded = arm_probe_decode_insn(insn, asi);
+	decoded = arm_probe_decode_insn(insn, &asi->api);
 
 	if (decoded != INSN_REJECTED && scan_end)
 		if (is_probed_address_atomic(addr - 1, scan_end))
@@ -170,3 +170,4 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi)
 
 	return decoded;
 }
+#endif
diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h
index d438289646a6..76d3f315407f 100644
--- a/arch/arm64/kernel/probes/decode-insn.h
+++ b/arch/arm64/kernel/probes/decode-insn.h
@@ -23,13 +23,17 @@
  */
 #define MAX_ATOMIC_CONTEXT_SIZE	(128 / sizeof(kprobe_opcode_t))
 
-enum kprobe_insn {
+enum probe_insn {
 	INSN_REJECTED,
 	INSN_GOOD_NO_SLOT,
 	INSN_GOOD,
 };
 
-enum kprobe_insn __kprobes
+#ifdef CONFIG_KPROBES
+enum probe_insn __kprobes
 arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi);
+#endif
+enum probe_insn __kprobes
+arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi);
 
 #endif /* _ARM_KERNEL_KPROBES_ARM64_H */
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index f5077ea7af6d..1decd2b2c730 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -44,31 +44,31 @@ post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *);
 static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
 {
 	/* prepare insn slot */
-	p->ainsn.insn[0] = cpu_to_le32(p->opcode);
+	p->ainsn.api.insn[0] = cpu_to_le32(p->opcode);
 
-	flush_icache_range((uintptr_t) (p->ainsn.insn),
-			   (uintptr_t) (p->ainsn.insn) +
+	flush_icache_range((uintptr_t) (p->ainsn.api.insn),
+			   (uintptr_t) (p->ainsn.api.insn) +
 			   MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
 
 	/*
 	 * Needs restoring of return address after stepping xol.
 	 */
-	p->ainsn.restore = (unsigned long) p->addr +
+	p->ainsn.api.restore = (unsigned long) p->addr +
 	  sizeof(kprobe_opcode_t);
 }
 
 static void __kprobes arch_prepare_simulate(struct kprobe *p)
 {
 	/* This instructions is not executed xol. No need to adjust the PC */
-	p->ainsn.restore = 0;
+	p->ainsn.api.restore = 0;
 }
 
 static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	if (p->ainsn.handler)
-		p->ainsn.handler((u32)p->opcode, (long)p->addr, regs);
+	if (p->ainsn.api.handler)
+		p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs);
 
 	/* single step simulated, now go for post processing */
 	post_kprobe_handler(kcb, regs);
@@ -98,18 +98,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 		return -EINVAL;
 
 	case INSN_GOOD_NO_SLOT:	/* insn need simulation */
-		p->ainsn.insn = NULL;
+		p->ainsn.api.insn = NULL;
 		break;
 
 	case INSN_GOOD:	/* instruction uses slot */
-		p->ainsn.insn = get_insn_slot();
-		if (!p->ainsn.insn)
+		p->ainsn.api.insn = get_insn_slot();
+		if (!p->ainsn.api.insn)
 			return -ENOMEM;
 		break;
 	};
 
 	/* prepare the instruction */
-	if (p->ainsn.insn)
+	if (p->ainsn.api.insn)
 		arch_prepare_ss_slot(p);
 	else
 		arch_prepare_simulate(p);
@@ -142,9 +142,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	if (p->ainsn.insn) {
-		free_insn_slot(p->ainsn.insn, 0);
-		p->ainsn.insn = NULL;
+	if (p->ainsn.api.insn) {
+		free_insn_slot(p->ainsn.api.insn, 0);
+		p->ainsn.api.insn = NULL;
 	}
 }
 
@@ -244,9 +244,9 @@ static void __kprobes setup_singlestep(struct kprobe *p,
 	}
 
 
-	if (p->ainsn.insn) {
+	if (p->ainsn.api.insn) {
 		/* prepare for single stepping */
-		slot = (unsigned long)p->ainsn.insn;
+		slot = (unsigned long)p->ainsn.api.insn;
 
 		set_ss_context(kcb, slot);	/* mark pending ss */
 
@@ -295,8 +295,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs)
 		return;
 
 	/* return addr restore if non-branching insn */
-	if (cur->ainsn.restore != 0)
-		instruction_pointer_set(regs, cur->ainsn.restore);
+	if (cur->ainsn.api.restore != 0)
+		instruction_pointer_set(regs, cur->ainsn.api.restore);
 
 	/* restore back original saved kprobe variables and continue */
 	if (kcb->kprobe_status == KPROBE_REENTER) {
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c
index 8977ce9d009d..357d3efe1366 100644
--- a/arch/arm64/kernel/probes/simulate-insn.c
+++ b/arch/arm64/kernel/probes/simulate-insn.c
@@ -13,28 +13,26 @@
  * General Public License for more details.
  */
 
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 
 #include "simulate-insn.h"
 
-#define sign_extend(x, signbit)		\
-	((x) | (0 - ((x) & (1 << (signbit)))))
-
 #define bbl_displacement(insn)		\
-	sign_extend(((insn) & 0x3ffffff) << 2, 27)
+	sign_extend32(((insn) & 0x3ffffff) << 2, 27)
 
 #define bcond_displacement(insn)	\
-	sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+	sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20)
 
 #define cbz_displacement(insn)	\
-	sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+	sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20)
 
 #define tbz_displacement(insn)	\
-	sign_extend(((insn >> 5) & 0x3fff) << 2, 15)
+	sign_extend32(((insn >> 5) & 0x3fff) << 2, 15)
 
 #define ldr_displacement(insn)	\
-	sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+	sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20)
 
 static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val)
 {
@@ -106,7 +104,7 @@ simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs)
 
 	xn = opcode & 0x1f;
 	imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3);
-	imm = sign_extend(imm, 20);
+	imm = sign_extend64(imm, 20);
 	if (opcode & 0x80000000)
 		val = (imm<<12) + (addr & 0xfffffffffffff000);
 	else
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
new file mode 100644
index 000000000000..26c998534dca
--- /dev/null
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <asm/cacheflush.h>
+
+#include "decode-insn.h"
+
+#define UPROBE_INV_FAULT_CODE	UINT_MAX
+
+void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+		void *src, unsigned long len)
+{
+	void *xol_page_kaddr = kmap_atomic(page);
+	void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK);
+
+	/* Initialize the slot */
+	memcpy(dst, src, len);
+
+	/* flush caches (dcache/icache) */
+	sync_icache_aliases(dst, len);
+
+	kunmap_atomic(xol_page_kaddr);
+}
+
+unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+	return instruction_pointer(regs);
+}
+
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
+		unsigned long addr)
+{
+	probe_opcode_t insn;
+
+	/* TODO: Currently we do not support AARCH32 instruction probing */
+	if (test_bit(TIF_32BIT, &mm->context.flags))
+		return -ENOTSUPP;
+	else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE))
+		return -EINVAL;
+
+	insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+
+	switch (arm_probe_decode_insn(insn, &auprobe->api)) {
+	case INSN_REJECTED:
+		return -EINVAL;
+
+	case INSN_GOOD_NO_SLOT:
+		auprobe->simulate = true;
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	/* Initialize with an invalid fault code to detect if ol insn trapped */
+	current->thread.fault_code = UPROBE_INV_FAULT_CODE;
+
+	/* Instruction points to execute ol */
+	instruction_pointer_set(regs, utask->xol_vaddr);
+
+	user_enable_single_step(current);
+
+	return 0;
+}
+
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	WARN_ON_ONCE(current->thread.fault_code != UPROBE_INV_FAULT_CODE);
+
+	/* Instruction points to execute next to breakpoint address */
+	instruction_pointer_set(regs, utask->vaddr + 4);
+
+	user_disable_single_step(current);
+
+	return 0;
+}
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+	/*
+	 * Between arch_uprobe_pre_xol and arch_uprobe_post_xol, if an xol
+	 * insn itself is trapped, then detect the case with the help of
+	 * invalid fault code which is being set in arch_uprobe_pre_xol
+	 */
+	if (t->thread.fault_code != UPROBE_INV_FAULT_CODE)
+		return true;
+
+	return false;
+}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	probe_opcode_t insn;
+	unsigned long addr;
+
+	if (!auprobe->simulate)
+		return false;
+
+	insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+	addr = instruction_pointer(regs);
+
+	if (auprobe->api.handler)
+		auprobe->api.handler(insn, addr, regs);
+
+	return true;
+}
+
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	/*
+	 * Task has received a fatal signal, so reset back to probbed
+	 * address.
+	 */
+	instruction_pointer_set(regs, utask->vaddr);
+
+	user_disable_single_step(current);
+}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+		struct pt_regs *regs)
+{
+	/*
+	 * If a simple branch instruction (B) was called for retprobed
+	 * assembly label then return true even when regs->sp and ret->stack
+	 * are same. It will ensure that cleanup and reporting of return
+	 * instances corresponding to callee label is done when
+	 * handle_trampoline for called function is executed.
+	 */
+	if (ctx == RP_CHECK_CHAIN_CALL)
+		return regs->sp <= ret->stack;
+	else
+		return regs->sp < ret->stack;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
+				  struct pt_regs *regs)
+{
+	unsigned long orig_ret_vaddr;
+
+	orig_ret_vaddr = procedure_link_pointer(regs);
+	/* Replace the return addr with trampoline addr */
+	procedure_link_pointer_set(regs, trampoline_vaddr);
+
+	return orig_ret_vaddr;
+}
+
+int arch_uprobe_exception_notify(struct notifier_block *self,
+				 unsigned long val, void *data)
+{
+	return NOTIFY_DONE;
+}
+
+static int uprobe_breakpoint_handler(struct pt_regs *regs,
+		unsigned int esr)
+{
+	if (user_mode(regs) && uprobe_pre_sstep_notifier(regs))
+		return DBG_HOOK_HANDLED;
+
+	return DBG_HOOK_ERROR;
+}
+
+static int uprobe_single_step_handler(struct pt_regs *regs,
+		unsigned int esr)
+{
+	struct uprobe_task *utask = current->utask;
+
+	if (user_mode(regs)) {
+		WARN_ON(utask &&
+			(instruction_pointer(regs) != utask->xol_vaddr + 4));
+
+		if (uprobe_post_sstep_notifier(regs))
+			return DBG_HOOK_HANDLED;
+	}
+
+	return DBG_HOOK_ERROR;
+}
+
+/* uprobe breakpoint handler hook */
+static struct break_hook uprobes_break_hook = {
+	.esr_mask = BRK64_ESR_MASK,
+	.esr_val = BRK64_ESR_UPROBES,
+	.fn = uprobe_breakpoint_handler,
+};
+
+/* uprobe single step handler hook */
+static struct step_hook uprobes_step_hook = {
+	.fn = uprobe_single_step_handler,
+};
+
+static int __init arch_init_uprobes(void)
+{
+	register_break_hook(&uprobes_break_hook);
+	register_step_hook(&uprobes_step_hook);
+
+	return 0;
+}
+
+device_initcall(arch_init_uprobes);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 01753cd7d3f0..a3a2816ba73a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -45,6 +45,7 @@
 #include <linux/personality.h>
 #include <linux/notifier.h>
 #include <trace/events/power.h>
+#include <linux/percpu.h>
 
 #include <asm/alternative.h>
 #include <asm/compat.h>
@@ -282,7 +283,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 		memset(childregs, 0, sizeof(struct pt_regs));
 		childregs->pstate = PSR_MODE_EL1h;
 		if (IS_ENABLED(CONFIG_ARM64_UAO) &&
-		    cpus_have_cap(ARM64_HAS_UAO))
+		    cpus_have_const_cap(ARM64_HAS_UAO))
 			childregs->pstate |= PSR_UAO_BIT;
 		p->thread.cpu_context.x19 = stack_start;
 		p->thread.cpu_context.x20 = stk_sz;
@@ -322,6 +323,20 @@ void uao_thread_switch(struct task_struct *next)
 }
 
 /*
+ * We store our current task in sp_el0, which is clobbered by userspace. Keep a
+ * shadow copy so that we can restore this upon entry from userspace.
+ *
+ * This is *only* for exception entry from EL0, and is not valid until we
+ * __switch_to() a user task.
+ */
+DEFINE_PER_CPU(struct task_struct *, __entry_task);
+
+static void entry_task_switch(struct task_struct *next)
+{
+	__this_cpu_write(__entry_task, next);
+}
+
+/*
  * Thread switching.
  */
 struct task_struct *__switch_to(struct task_struct *prev,
@@ -333,6 +348,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	tls_thread_switch(next);
 	hw_breakpoint_thread_switch(next);
 	contextidr_thread_switch(next);
+	entry_task_switch(next);
 	uao_thread_switch(next);
 
 	/*
@@ -350,27 +366,35 @@ struct task_struct *__switch_to(struct task_struct *prev,
 unsigned long get_wchan(struct task_struct *p)
 {
 	struct stackframe frame;
-	unsigned long stack_page;
+	unsigned long stack_page, ret = 0;
 	int count = 0;
 	if (!p || p == current || p->state == TASK_RUNNING)
 		return 0;
 
+	stack_page = (unsigned long)try_get_task_stack(p);
+	if (!stack_page)
+		return 0;
+
 	frame.fp = thread_saved_fp(p);
 	frame.sp = thread_saved_sp(p);
 	frame.pc = thread_saved_pc(p);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	frame.graph = p->curr_ret_stack;
 #endif
-	stack_page = (unsigned long)task_stack_page(p);
 	do {
 		if (frame.sp < stack_page ||
 		    frame.sp >= stack_page + THREAD_SIZE ||
 		    unwind_frame(p, &frame))
-			return 0;
-		if (!in_sched_functions(frame.pc))
-			return frame.pc;
+			goto out;
+		if (!in_sched_functions(frame.pc)) {
+			ret = frame.pc;
+			goto out;
+		}
 	} while (count ++ < 16);
-	return 0;
+
+out:
+	put_task_stack(p);
+	return ret;
 }
 
 unsigned long arch_align_stack(unsigned long sp)
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index e0c81da60f76..fc35e06ccaac 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -327,13 +327,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
 				     struct arch_hw_breakpoint_ctrl ctrl,
 				     struct perf_event_attr *attr)
 {
-	int err, len, type, disabled = !ctrl.enabled;
+	int err, len, type, offset, disabled = !ctrl.enabled;
 
 	attr->disabled = disabled;
 	if (disabled)
 		return 0;
 
-	err = arch_bp_generic_fields(ctrl, &len, &type);
+	err = arch_bp_generic_fields(ctrl, &len, &type, &offset);
 	if (err)
 		return err;
 
@@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
 
 	attr->bp_len	= len;
 	attr->bp_type	= type;
+	attr->bp_addr	+= offset;
 
 	return 0;
 }
@@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type,
 	if (IS_ERR(bp))
 		return PTR_ERR(bp);
 
-	*addr = bp ? bp->attr.bp_addr : 0;
+	*addr = bp ? counter_arch_bp(bp)->address : 0;
 	return 0;
 }
 
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
index 1718706fde83..12a87f2600f2 100644
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/ftrace.h>
 
+#include <asm/stack_pointer.h>
 #include <asm/stacktrace.h>
 
 struct return_address_data {
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f534f492a268..a53f52ac81c6 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -291,6 +291,15 @@ void __init setup_arch(char **cmdline_p)
 	smp_init_cpus();
 	smp_build_mpidr_hash();
 
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	/*
+	 * Make sure init_thread_info.ttbr0 always generates translation
+	 * faults in case uaccess_enable() is inadvertently called by the init
+	 * thread.
+	 */
+	init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page);
+#endif
+
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
 	conswitchp = &vga_con;
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 404dd67080b9..c7b6de62f9d3 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -414,6 +414,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 		} else {
 			local_irq_enable();
 
+			if (thread_flags & _TIF_UPROBE)
+				uprobe_notify_resume(regs);
+
 			if (thread_flags & _TIF_SIGPENDING)
 				do_signal(regs);
 
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 1bec41b5fda3..df67652e46f0 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -125,9 +125,6 @@ ENTRY(_cpu_resume)
 	/* load sp from context */
 	ldr	x2, [x0, #CPU_CTX_SP]
 	mov	sp, x2
-	/* save thread_info */
-	and	x2, x2, #~(THREAD_SIZE - 1)
-	msr	sp_el0, x2
 	/*
 	 * cpu_do_resume expects x0 to contain context address pointer
 	 */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 8507703dabe4..cb87234cfcf2 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -58,6 +58,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ipi.h>
 
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
 /*
  * as from 2.5, kernels no longer have an init_tasks structure
  * so we need some other way of telling a new secondary core
@@ -146,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 	 * We need to tell the secondary core where to find its stack and the
 	 * page tables.
 	 */
+	secondary_data.task = idle;
 	secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
 	update_cpu_boot_status(CPU_MMU_OFF);
 	__flush_dcache_area(&secondary_data, sizeof(secondary_data));
@@ -170,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 		pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
 	}
 
+	secondary_data.task = NULL;
 	secondary_data.stack = NULL;
 	status = READ_ONCE(secondary_data.status);
 	if (ret && status) {
@@ -208,7 +213,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 asmlinkage void secondary_start_kernel(void)
 {
 	struct mm_struct *mm = &init_mm;
-	unsigned int cpu = smp_processor_id();
+	unsigned int cpu;
+
+	cpu = task_cpu(current);
+	set_my_cpu_offset(per_cpu_offset(cpu));
 
 	/*
 	 * All kernel threads share the same mm context; grab a
@@ -217,8 +225,6 @@ asmlinkage void secondary_start_kernel(void)
 	atomic_inc(&mm->mm_count);
 	current->active_mm = mm;
 
-	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
-
 	/*
 	 * TTBR0 is only used for the identity mapping at this stage. Make it
 	 * point to zero page to avoid speculatively fetching new entries.
@@ -718,6 +724,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	 */
 	for_each_possible_cpu(cpu) {
 
+		per_cpu(cpu_number, cpu) = cpu;
+
 		if (cpu == smp_processor_id())
 			continue;
 
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index c2efddfca18c..8a552a33c6ef 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -22,6 +22,7 @@
 #include <linux/stacktrace.h>
 
 #include <asm/irq.h>
+#include <asm/stack_pointer.h>
 #include <asm/stacktrace.h>
 
 /*
@@ -128,7 +129,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
 			break;
 	}
 }
-EXPORT_SYMBOL(walk_stackframe);
 
 #ifdef CONFIG_STACKTRACE
 struct stack_trace_data {
@@ -181,6 +181,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 	struct stack_trace_data data;
 	struct stackframe frame;
 
+	if (!try_get_task_stack(tsk))
+		return;
+
 	data.trace = trace;
 	data.skip = trace->skip;
 
@@ -202,6 +205,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 	walk_stackframe(tsk, &frame, save_trace, &data);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
+
+	put_task_stack(tsk);
 }
 
 void save_stack_trace(struct stack_trace *trace)
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index bb0cd787a9d3..1e3be9064cfa 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -47,12 +47,6 @@ void notrace __cpu_suspend_exit(void)
 	cpu_uninstall_idmap();
 
 	/*
-	 * Restore per-cpu offset before any kernel
-	 * subsystem relying on it has a chance to run.
-	 */
-	set_my_cpu_offset(per_cpu_offset(cpu));
-
-	/*
 	 * PSTATE was not saved over suspend/resume, re-enable any detected
 	 * features that might not have been set correctly.
 	 */
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 694f6deedbab..23e9e13bd2aa 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -19,10 +19,226 @@
 #include <linux/nodemask.h>
 #include <linux/of.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/cpufreq.h>
 
+#include <asm/cpu.h>
 #include <asm/cputype.h>
 #include <asm/topology.h>
 
+static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_MUTEX(cpu_scale_mutex);
+
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(cpu_scale, cpu);
+}
+
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+{
+	per_cpu(cpu_scale, cpu) = capacity;
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static ssize_t cpu_capacity_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+
+	return sprintf(buf, "%lu\n",
+			arch_scale_cpu_capacity(NULL, cpu->dev.id));
+}
+
+static ssize_t cpu_capacity_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf,
+				  size_t count)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	int this_cpu = cpu->dev.id, i;
+	unsigned long new_capacity;
+	ssize_t ret;
+
+	if (count) {
+		ret = kstrtoul(buf, 0, &new_capacity);
+		if (ret)
+			return ret;
+		if (new_capacity > SCHED_CAPACITY_SCALE)
+			return -EINVAL;
+
+		mutex_lock(&cpu_scale_mutex);
+		for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
+			set_capacity_scale(i, new_capacity);
+		mutex_unlock(&cpu_scale_mutex);
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(cpu_capacity);
+
+static int register_cpu_capacity_sysctl(void)
+{
+	int i;
+	struct device *cpu;
+
+	for_each_possible_cpu(i) {
+		cpu = get_cpu_device(i);
+		if (!cpu) {
+			pr_err("%s: too early to get CPU%d device!\n",
+			       __func__, i);
+			continue;
+		}
+		device_create_file(cpu, &dev_attr_cpu_capacity);
+	}
+
+	return 0;
+}
+subsys_initcall(register_cpu_capacity_sysctl);
+#endif
+
+static u32 capacity_scale;
+static u32 *raw_capacity;
+static bool cap_parsing_failed;
+
+static void __init parse_cpu_capacity(struct device_node *cpu_node, int cpu)
+{
+	int ret;
+	u32 cpu_capacity;
+
+	if (cap_parsing_failed)
+		return;
+
+	ret = of_property_read_u32(cpu_node,
+				   "capacity-dmips-mhz",
+				   &cpu_capacity);
+	if (!ret) {
+		if (!raw_capacity) {
+			raw_capacity = kcalloc(num_possible_cpus(),
+					       sizeof(*raw_capacity),
+					       GFP_KERNEL);
+			if (!raw_capacity) {
+				pr_err("cpu_capacity: failed to allocate memory for raw capacities\n");
+				cap_parsing_failed = true;
+				return;
+			}
+		}
+		capacity_scale = max(cpu_capacity, capacity_scale);
+		raw_capacity[cpu] = cpu_capacity;
+		pr_debug("cpu_capacity: %s cpu_capacity=%u (raw)\n",
+			cpu_node->full_name, raw_capacity[cpu]);
+	} else {
+		if (raw_capacity) {
+			pr_err("cpu_capacity: missing %s raw capacity\n",
+				cpu_node->full_name);
+			pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n");
+		}
+		cap_parsing_failed = true;
+		kfree(raw_capacity);
+	}
+}
+
+static void normalize_cpu_capacity(void)
+{
+	u64 capacity;
+	int cpu;
+
+	if (!raw_capacity || cap_parsing_failed)
+		return;
+
+	pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale);
+	mutex_lock(&cpu_scale_mutex);
+	for_each_possible_cpu(cpu) {
+		pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n",
+			 cpu, raw_capacity[cpu]);
+		capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT)
+			/ capacity_scale;
+		set_capacity_scale(cpu, capacity);
+		pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
+			cpu, arch_scale_cpu_capacity(NULL, cpu));
+	}
+	mutex_unlock(&cpu_scale_mutex);
+}
+
+#ifdef CONFIG_CPU_FREQ
+static cpumask_var_t cpus_to_visit;
+static bool cap_parsing_done;
+static void parsing_done_workfn(struct work_struct *work);
+static DECLARE_WORK(parsing_done_work, parsing_done_workfn);
+
+static int
+init_cpu_capacity_callback(struct notifier_block *nb,
+			   unsigned long val,
+			   void *data)
+{
+	struct cpufreq_policy *policy = data;
+	int cpu;
+
+	if (cap_parsing_failed || cap_parsing_done)
+		return 0;
+
+	switch (val) {
+	case CPUFREQ_NOTIFY:
+		pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n",
+				cpumask_pr_args(policy->related_cpus),
+				cpumask_pr_args(cpus_to_visit));
+		cpumask_andnot(cpus_to_visit,
+			       cpus_to_visit,
+			       policy->related_cpus);
+		for_each_cpu(cpu, policy->related_cpus) {
+			raw_capacity[cpu] = arch_scale_cpu_capacity(NULL, cpu) *
+					    policy->cpuinfo.max_freq / 1000UL;
+			capacity_scale = max(raw_capacity[cpu], capacity_scale);
+		}
+		if (cpumask_empty(cpus_to_visit)) {
+			normalize_cpu_capacity();
+			kfree(raw_capacity);
+			pr_debug("cpu_capacity: parsing done\n");
+			cap_parsing_done = true;
+			schedule_work(&parsing_done_work);
+		}
+	}
+	return 0;
+}
+
+static struct notifier_block init_cpu_capacity_notifier = {
+	.notifier_call = init_cpu_capacity_callback,
+};
+
+static int __init register_cpufreq_notifier(void)
+{
+	if (cap_parsing_failed)
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) {
+		pr_err("cpu_capacity: failed to allocate memory for cpus_to_visit\n");
+		return -ENOMEM;
+	}
+	cpumask_copy(cpus_to_visit, cpu_possible_mask);
+
+	return cpufreq_register_notifier(&init_cpu_capacity_notifier,
+					 CPUFREQ_POLICY_NOTIFIER);
+}
+core_initcall(register_cpufreq_notifier);
+
+static void parsing_done_workfn(struct work_struct *work)
+{
+	cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
+					 CPUFREQ_POLICY_NOTIFIER);
+}
+
+#else
+static int __init free_raw_capacity(void)
+{
+	kfree(raw_capacity);
+
+	return 0;
+}
+core_initcall(free_raw_capacity);
+#endif
+
 static int __init get_cpu_for_node(struct device_node *node)
 {
 	struct device_node *cpu_node;
@@ -34,6 +250,7 @@ static int __init get_cpu_for_node(struct device_node *node)
 
 	for_each_possible_cpu(cpu) {
 		if (of_get_cpu_node(cpu, NULL) == cpu_node) {
+			parse_cpu_capacity(cpu_node, cpu);
 			of_node_put(cpu_node);
 			return cpu;
 		}
@@ -178,13 +395,17 @@ static int __init parse_dt_topology(void)
 	 * cluster with restricted subnodes.
 	 */
 	map = of_get_child_by_name(cn, "cpu-map");
-	if (!map)
+	if (!map) {
+		cap_parsing_failed = true;
 		goto out;
+	}
 
 	ret = parse_cluster(map, 0);
 	if (ret != 0)
 		goto out_map;
 
+	normalize_cpu_capacity();
+
 	/*
 	 * Check that all cores are in the topology; the SMP code will
 	 * only mark cores described in the DT as possible.
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index c9986b3e0a96..5b830be79c01 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -38,6 +38,7 @@
 #include <asm/esr.h>
 #include <asm/insn.h>
 #include <asm/traps.h>
+#include <asm/stack_pointer.h>
 #include <asm/stacktrace.h>
 #include <asm/exception.h>
 #include <asm/system_misc.h>
@@ -147,6 +148,9 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 	if (!tsk)
 		tsk = current;
 
+	if (!try_get_task_stack(tsk))
+		return;
+
 	/*
 	 * Switching between stacks is valid when tracing current and in
 	 * non-preemptible context.
@@ -212,6 +216,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 				 stack + sizeof(struct pt_regs));
 		}
 	}
+
+	put_task_stack(tsk);
 }
 
 void show_stack(struct task_struct *tsk, unsigned long *sp)
@@ -227,10 +233,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp)
 #endif
 #define S_SMP " SMP"
 
-static int __die(const char *str, int err, struct thread_info *thread,
-		 struct pt_regs *regs)
+static int __die(const char *str, int err, struct pt_regs *regs)
 {
-	struct task_struct *tsk = thread->task;
+	struct task_struct *tsk = current;
 	static int die_counter;
 	int ret;
 
@@ -245,7 +250,8 @@ static int __die(const char *str, int err, struct thread_info *thread,
 	print_modules();
 	__show_regs(regs);
 	pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n",
-		 TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1);
+		 TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk),
+		 end_of_stack(tsk));
 
 	if (!user_mode(regs)) {
 		dump_mem(KERN_EMERG, "Stack: ", regs->sp,
@@ -264,7 +270,6 @@ static DEFINE_RAW_SPINLOCK(die_lock);
  */
 void die(const char *str, struct pt_regs *regs, int err)
 {
-	struct thread_info *thread = current_thread_info();
 	int ret;
 
 	oops_enter();
@@ -272,9 +277,9 @@ void die(const char *str, struct pt_regs *regs, int err)
 	raw_spin_lock_irq(&die_lock);
 	console_verbose();
 	bust_spinlocks(1);
-	ret = __die(str, err, thread, regs);
+	ret = __die(str, err, regs);
 
-	if (regs && kexec_should_crash(thread->task))
+	if (regs && kexec_should_crash(current))
 		crash_kexec(regs);
 
 	bust_spinlocks(0);
@@ -435,9 +440,10 @@ int cpu_enable_cache_maint_trap(void *__unused)
 }
 
 #define __user_cache_maint(insn, address, res)			\
-	if (untagged_addr(address) >= user_addr_max())		\
+	if (untagged_addr(address) >= user_addr_max()) {	\
 		res = -EFAULT;					\
-	else							\
+	} else {						\
+		uaccess_ttbr0_enable();				\
 		asm volatile (					\
 			"1:	" insn ", %1\n"			\
 			"	mov	%w0, #0\n"		\
@@ -449,7 +455,9 @@ int cpu_enable_cache_maint_trap(void *__unused)
 			"	.popsection\n"			\
 			_ASM_EXTABLE(1b, 3b)			\
 			: "=r" (res)				\
-			: "r" (address), "i" (-EFAULT) )
+			: "r" (address), "i" (-EFAULT));	\
+		uaccess_ttbr0_disable();			\
+	}
 
 static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
 {
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 1105aab1e6d6..b8deffa9e1bf 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -216,6 +216,11 @@ SECTIONS
 	swapper_pg_dir = .;
 	. += SWAPPER_DIR_SIZE;
 
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	reserved_ttbr0 = .;
+	. += RESERVED_TTBR0_SIZE;
+#endif
+
 	_end = .;
 
 	STABS_DEBUG
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 6eaf12c1d627..52cb7ad9b2fd 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -16,9 +16,6 @@ menuconfig VIRTUALIZATION
 
 if VIRTUALIZATION
 
-config KVM_ARM_VGIC_V3_ITS
-	bool
-
 config KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
 	depends on OF
@@ -34,7 +31,6 @@ config KVM
 	select KVM_VFIO
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQFD
-	select KVM_ARM_VGIC_V3_ITS
 	select KVM_ARM_PMU if HW_PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_IRQCHIP
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index a204adf29f0a..1bfe30dfbfe7 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -57,6 +57,16 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	return 1;
 }
 
+/*
+ * Guest access to FP/ASIMD registers are routed to this handler only
+ * when the system doesn't support FP/ASIMD.
+ */
+static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	kvm_inject_undefined(vcpu);
+	return 1;
+}
+
 /**
  * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event
  *		    instruction executed by a guest
@@ -144,6 +154,7 @@ static exit_handle_fn arm_exit_handlers[] = {
 	[ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
 	[ESR_ELx_EC_BKPT32]	= kvm_handle_guest_debug,
 	[ESR_ELx_EC_BRK64]	= kvm_handle_guest_debug,
+	[ESR_ELx_EC_FP_ASIMD]	= handle_no_fpsimd,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 4e92399f7105..5e9052f087f2 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -106,9 +106,16 @@ el1_trap:
 	 * x0: ESR_EC
 	 */
 
-	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
+	/*
+	 * We trap the first access to the FP/SIMD to save the host context
+	 * and restore the guest context lazily.
+	 * If FP/SIMD is not implemented, handle the trap and inject an
+	 * undefined instruction exception to the guest.
+	 */
+alternative_if_not ARM64_HAS_NO_FPSIMD
 	cmp	x0, #ESR_ELx_EC_FP_ASIMD
 	b.eq	__fpsimd_guest_restore
+alternative_else_nop_endif
 
 	mrs	x1, tpidr_el2
 	mov	x0, #ARM_EXCEPTION_TRAP
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 83037cd62d01..75e83dd40d43 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -21,6 +21,7 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/fpsimd.h>
 
 static bool __hyp_text __fpsimd_enabled_nvhe(void)
 {
@@ -76,16 +77,24 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
 	 * traps are only taken to EL2 if the operation would not otherwise
 	 * trap to EL1.  Therefore, always make sure that for 32-bit guests,
 	 * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+	 * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to
+	 * it will cause an exception.
 	 */
 	val = vcpu->arch.hcr_el2;
-	if (!(val & HCR_RW)) {
+	if (!(val & HCR_RW) && system_supports_fpsimd()) {
 		write_sysreg(1 << 30, fpexc32_el2);
 		isb();
 	}
 	write_sysreg(val, hcr_el2);
 	/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
 	write_sysreg(1 << 15, hstr_el2);
-	/* Make sure we trap PMU access from EL0 to EL2 */
+	/*
+	 * Make sure we trap PMU access from EL0 to EL2. Also sanitize
+	 * PMSELR_EL0 to make sure it never contains the cycle
+	 * counter, which could make a PMXEVCNTR_EL0 access UNDEF at
+	 * EL1 instead of being trapped to EL2.
+	 */
+	write_sysreg(0, pmselr_el0);
 	write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
 	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 	__activate_traps_arch()();
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 5bc460884639..e95d4f68bf54 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -86,12 +86,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VCPU_ATTRIBUTES:
 		r = 1;
 		break;
-	case KVM_CAP_MSI_DEVID:
-		if (!kvm)
-			r = -EINVAL;
-		else
-			r = kvm->arch.vgic.msis_require_devid;
-		break;
 	default:
 		r = 0;
 	}
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index 5d1cad3ce6d6..d7150e30438a 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,10 +17,7 @@
  */
 #include <linux/linkage.h>
 
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
 
 	.text
 
@@ -33,8 +30,7 @@
  * Alignment fixed up by hardware.
  */
 ENTRY(__clear_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_enable_not_uao x2, x3
 	mov	x2, x1			// save the size for fixup return
 	subs	x1, x1, #8
 	b.mi	2f
@@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
 	b.mi	5f
 uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
 5:	mov	x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_disable_not_uao x2
 	ret
 ENDPROC(__clear_user)
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 4fd67ea03bb0..cfe13396085b 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -16,11 +16,8 @@
 
 #include <linux/linkage.h>
 
-#include <asm/alternative.h>
-#include <asm/assembler.h>
 #include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
 
 /*
  * Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -67,12 +64,10 @@
 
 end	.req	x5
 ENTRY(__arch_copy_from_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_enable_not_uao x3, x4
 	add	end, x0, x2
 #include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_disable_not_uao x3
 	mov	x0, #0				// Nothing to copy
 	ret
 ENDPROC(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index f7292dd08c84..718b1c4e2f85 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -18,11 +18,8 @@
 
 #include <linux/linkage.h>
 
-#include <asm/alternative.h>
-#include <asm/assembler.h>
 #include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
 
 /*
  * Copy from user space to user space (alignment handled by the hardware)
@@ -68,12 +65,10 @@
 
 end	.req	x5
 ENTRY(__copy_in_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_enable_not_uao x3, x4
 	add	end, x0, x2
 #include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_disable_not_uao x3
 	mov	x0, #0
 	ret
 ENDPROC(__copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 7a7efe255034..e99e31c9acac 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -16,11 +16,8 @@
 
 #include <linux/linkage.h>
 
-#include <asm/alternative.h>
-#include <asm/assembler.h>
 #include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
 
 /*
  * Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -66,12 +63,10 @@
 
 end	.req	x5
 ENTRY(__arch_copy_to_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_enable_not_uao x3, x4
 	add	end, x0, x2
 #include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
-	    CONFIG_ARM64_PAN)
+	uaccess_disable_not_uao x3
 	mov	x0, #0
 	ret
 ENDPROC(__arch_copy_to_user)
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 54bb209cae8e..e703fb9defad 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -3,7 +3,8 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   ioremap.o mmap.o pgd.o mmu.o \
 				   context.o proc.o pageattr.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
-obj-$(CONFIG_ARM64_PTDUMP)	+= dump.o
+obj-$(CONFIG_ARM64_PTDUMP_CORE)	+= dump.o
+obj-$(CONFIG_ARM64_PTDUMP_DEBUGFS)	+= ptdump_debugfs.o
 obj-$(CONFIG_NUMA)		+= numa.o
 
 obj-$(CONFIG_KASAN)		+= kasan_init.o
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 58b5a906ff78..da9576932322 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -23,6 +23,7 @@
 #include <asm/assembler.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
+#include <asm/uaccess.h>
 
 /*
  *	flush_icache_range(start,end)
@@ -48,6 +49,7 @@ ENTRY(flush_icache_range)
  *	- end     - virtual end address of region
  */
 ENTRY(__flush_cache_user_range)
+	uaccess_ttbr0_enable x2, x3
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	bic	x4, x0, x3
@@ -69,10 +71,12 @@ USER(9f, ic	ivau, x4	)		// invalidate I line PoU
 	dsb	ish
 	isb
 	mov	x0, #0
+1:
+	uaccess_ttbr0_disable x1
 	ret
 9:
 	mov	x0, #-EFAULT
-	ret
+	b	1b
 ENDPROC(flush_icache_range)
 ENDPROC(__flush_cache_user_range)
 
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index efcf1f7ef1e4..4c63cb154859 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -221,7 +221,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath:
-	cpu_switch_mm(mm->pgd, mm);
+	/*
+	 * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
+	 * emulating PAN.
+	 */
+	if (!system_uses_ttbr0_pan())
+		cpu_switch_mm(mm->pgd, mm);
 }
 
 static int asids_init(void)
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 3f74d0d98de6..aa6c8f834d9e 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -938,11 +938,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 
 void arch_teardown_dma_ops(struct device *dev)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
-
-	if (WARN_ON(domain))
-		iommu_detach_device(domain, dev);
-
 	dev->archdata.dma_ops = NULL;
 }
 
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 9c3e75df2180..ca74a2aace42 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -50,6 +50,18 @@ static const struct addr_marker address_markers[] = {
 	{ -1,				NULL },
 };
 
+#define pt_dump_seq_printf(m, fmt, args...)	\
+({						\
+	if (m)					\
+		seq_printf(m, fmt, ##args);	\
+})
+
+#define pt_dump_seq_puts(m, fmt)	\
+({					\
+	if (m)				\
+		seq_printf(m, fmt);	\
+})
+
 /*
  * The page dumper groups page table entries of the same type into a single
  * description. It uses pg_state to track the range information while
@@ -62,6 +74,9 @@ struct pg_state {
 	unsigned long start_address;
 	unsigned level;
 	u64 current_prot;
+	bool check_wx;
+	unsigned long wx_pages;
+	unsigned long uxn_pages;
 };
 
 struct prot_bits {
@@ -186,10 +201,39 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
 			s = bits->clear;
 
 		if (s)
-			seq_printf(st->seq, " %s", s);
+			pt_dump_seq_printf(st->seq, " %s", s);
 	}
 }
 
+static void note_prot_uxn(struct pg_state *st, unsigned long addr)
+{
+	if (!st->check_wx)
+		return;
+
+	if ((st->current_prot & PTE_UXN) == PTE_UXN)
+		return;
+
+	WARN_ONCE(1, "arm64/mm: Found non-UXN mapping at address %p/%pS\n",
+		  (void *)st->start_address, (void *)st->start_address);
+
+	st->uxn_pages += (addr - st->start_address) / PAGE_SIZE;
+}
+
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+	if (!st->check_wx)
+		return;
+	if ((st->current_prot & PTE_RDONLY) == PTE_RDONLY)
+		return;
+	if ((st->current_prot & PTE_PXN) == PTE_PXN)
+		return;
+
+	WARN_ONCE(1, "arm64/mm: Found insecure W+X mapping at address %p/%pS\n",
+		  (void *)st->start_address, (void *)st->start_address);
+
+	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+}
+
 static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
 				u64 val)
 {
@@ -200,14 +244,16 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
 		st->level = level;
 		st->current_prot = prot;
 		st->start_address = addr;
-		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	} else if (prot != st->current_prot || level != st->level ||
 		   addr >= st->marker[1].start_address) {
 		const char *unit = units;
 		unsigned long delta;
 
 		if (st->current_prot) {
-			seq_printf(st->seq, "0x%016lx-0x%016lx   ",
+			note_prot_uxn(st, addr);
+			note_prot_wx(st, addr);
+			pt_dump_seq_printf(st->seq, "0x%016lx-0x%016lx   ",
 				   st->start_address, addr);
 
 			delta = (addr - st->start_address) >> 10;
@@ -215,17 +261,17 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
 				delta >>= 10;
 				unit++;
 			}
-			seq_printf(st->seq, "%9lu%c %s", delta, *unit,
+			pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit,
 				   pg_level[st->level].name);
 			if (pg_level[st->level].bits)
 				dump_prot(st, pg_level[st->level].bits,
 					  pg_level[st->level].num);
-			seq_puts(st->seq, "\n");
+			pt_dump_seq_puts(st->seq, "\n");
 		}
 
 		if (addr >= st->marker[1].start_address) {
 			st->marker++;
-			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+			pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 		}
 
 		st->start_address = addr;
@@ -235,7 +281,7 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
 
 	if (addr >= st->marker[1].start_address) {
 		st->marker++;
-		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	}
 
 }
@@ -304,9 +350,8 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm,
 	}
 }
 
-static int ptdump_show(struct seq_file *m, void *v)
+void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info)
 {
-	struct ptdump_info *info = m->private;
 	struct pg_state st = {
 		.seq = m,
 		.marker = info->markers,
@@ -315,33 +360,16 @@ static int ptdump_show(struct seq_file *m, void *v)
 	walk_pgd(&st, info->mm, info->base_addr);
 
 	note_page(&st, 0, 0, 0);
-	return 0;
 }
 
-static int ptdump_open(struct inode *inode, struct file *file)
+static void ptdump_initialize(void)
 {
-	return single_open(file, ptdump_show, inode->i_private);
-}
-
-static const struct file_operations ptdump_fops = {
-	.open		= ptdump_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-int ptdump_register(struct ptdump_info *info, const char *name)
-{
-	struct dentry *pe;
 	unsigned i, j;
 
 	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
 		if (pg_level[i].bits)
 			for (j = 0; j < pg_level[i].num; j++)
 				pg_level[i].mask |= pg_level[i].bits[j].mask;
-
-	pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops);
-	return pe ? 0 : -ENOMEM;
 }
 
 static struct ptdump_info kernel_ptdump_info = {
@@ -350,8 +378,30 @@ static struct ptdump_info kernel_ptdump_info = {
 	.base_addr	= VA_START,
 };
 
+void ptdump_check_wx(void)
+{
+	struct pg_state st = {
+		.seq = NULL,
+		.marker = (struct addr_marker[]) {
+			{ 0, NULL},
+			{ -1, NULL},
+		},
+		.check_wx = true,
+	};
+
+	walk_pgd(&st, &init_mm, 0);
+	note_page(&st, 0, 0, 0);
+	if (st.wx_pages || st.uxn_pages)
+		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
+			st.wx_pages, st.uxn_pages);
+	else
+		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+}
+
 static int ptdump_init(void)
 {
-	return ptdump_register(&kernel_ptdump_info, "kernel_page_tables");
+	ptdump_initialize();
+	return ptdump_debugfs_register(&kernel_ptdump_info,
+					"kernel_page_tables");
 }
 device_initcall(ptdump_init);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 0f8788374815..a78a5c401806 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -269,13 +269,19 @@ out:
 	return fault;
 }
 
-static inline bool is_permission_fault(unsigned int esr)
+static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
 {
 	unsigned int ec       = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
 
-	return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) ||
-	       (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
+	if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
+		return false;
+
+	if (system_uses_ttbr0_pan())
+		return fsc_type == ESR_ELx_FSC_FAULT &&
+			(regs->pstate & PSR_PAN_BIT);
+	else
+		return fsc_type == ESR_ELx_FSC_PERM;
 }
 
 static bool is_el0_instruction_abort(unsigned int esr)
@@ -315,7 +321,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		mm_flags |= FAULT_FLAG_WRITE;
 	}
 
-	if (is_permission_fault(esr) && (addr < USER_DS)) {
+	if (addr < USER_DS && is_permission_fault(esr, regs)) {
 		/* regs->orig_addr_limit may be 0 if we entered from EL0 */
 		if (regs->orig_addr_limit == KERNEL_DS)
 			die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
@@ -507,10 +513,10 @@ static const struct fault_info {
 	{ do_bad,		SIGBUS,  0,		"unknown 17"			},
 	{ do_bad,		SIGBUS,  0,		"unknown 18"			},
 	{ do_bad,		SIGBUS,  0,		"unknown 19"			},
-	{ do_bad,		SIGBUS,  0,		"synchronous abort (translation table walk)" },
-	{ do_bad,		SIGBUS,  0,		"synchronous abort (translation table walk)" },
-	{ do_bad,		SIGBUS,  0,		"synchronous abort (translation table walk)" },
-	{ do_bad,		SIGBUS,  0,		"synchronous abort (translation table walk)" },
+	{ do_bad,		SIGBUS,  0,		"synchronous external abort (translation table walk)" },
+	{ do_bad,		SIGBUS,  0,		"synchronous external abort (translation table walk)" },
+	{ do_bad,		SIGBUS,  0,		"synchronous external abort (translation table walk)" },
+	{ do_bad,		SIGBUS,  0,		"synchronous external abort (translation table walk)" },
 	{ do_bad,		SIGBUS,  0,		"synchronous parity error"	},
 	{ do_bad,		SIGBUS,  0,		"unknown 25"			},
 	{ do_bad,		SIGBUS,  0,		"unknown 26"			},
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 8377329d8c97..554a2558c12e 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -25,14 +25,7 @@
 #include <asm/cachetype.h>
 #include <asm/tlbflush.h>
 
-void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
-		       unsigned long end)
-{
-	if (vma->vm_flags & VM_EXEC)
-		__flush_icache_all();
-}
-
-static void sync_icache_aliases(void *kaddr, unsigned long len)
+void sync_icache_aliases(void *kaddr, unsigned long len)
 {
 	unsigned long addr = (unsigned long)kaddr;
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2e49bd252fe7..964b7549af5c 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -51,20 +51,8 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr,
 	*pgsize = PAGE_SIZE;
 	if (!pte_cont(pte))
 		return 1;
-	if (!pgd_present(*pgd)) {
-		VM_BUG_ON(!pgd_present(*pgd));
-		return 1;
-	}
 	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud)) {
-		VM_BUG_ON(!pud_present(*pud));
-		return 1;
-	}
 	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd)) {
-		VM_BUG_ON(!pmd_present(*pmd));
-		return 1;
-	}
 	if ((pte_t *)pmd == ptep) {
 		*pgsize = PMD_SIZE;
 		return CONT_PMDS;
@@ -212,7 +200,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 		ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize);
 		/* save the 1st pte to return */
 		pte = ptep_get_and_clear(mm, addr, cpte);
-		for (i = 1; i < ncontig; ++i) {
+		for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) {
 			/*
 			 * If HW_AFDBM is enabled, then the HW could
 			 * turn on the dirty bit for any of the page
@@ -250,7 +238,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 		pfn = pte_pfn(*cpte);
 		ncontig = find_num_contig(vma->vm_mm, addr, cpte,
 					  *cpte, &pgsize);
-		for (i = 0; i < ncontig; ++i, ++cpte) {
+		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) {
 			changed = ptep_set_access_flags(vma, addr, cpte,
 							pfn_pte(pfn,
 								hugeprot),
@@ -273,7 +261,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 
 		cpte = huge_pte_offset(mm, addr);
 		ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize);
-		for (i = 0; i < ncontig; ++i, ++cpte)
+		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
 			ptep_set_wrprotect(mm, addr, cpte);
 	} else {
 		ptep_set_wrprotect(mm, addr, ptep);
@@ -291,7 +279,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
 		cpte = huge_pte_offset(vma->vm_mm, addr);
 		ncontig = find_num_contig(vma->vm_mm, addr, cpte,
 					  *cpte, &pgsize);
-		for (i = 0; i < ncontig; ++i, ++cpte)
+		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
 			ptep_clear_flush(vma, addr, cpte);
 	} else {
 		ptep_clear_flush(vma, addr, ptep);
@@ -323,7 +311,7 @@ __setup("hugepagesz=", setup_hugepagesz);
 static __init int add_default_hugepagesz(void)
 {
 	if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
-		hugetlb_add_hstate(CONT_PMD_SHIFT);
+		hugetlb_add_hstate(CONT_PTE_SHIFT);
 	return 0;
 }
 arch_initcall(add_default_hugepagesz);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 05615a3fdc6f..17243e43184e 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -28,8 +28,6 @@
 #include <linux/memblock.h>
 #include <linux/fs.h>
 #include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/stop_machine.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -42,6 +40,7 @@
 #include <asm/tlb.h>
 #include <asm/memblock.h>
 #include <asm/mmu_context.h>
+#include <asm/ptdump.h>
 
 u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
 
@@ -95,11 +94,24 @@ static phys_addr_t __init early_pgtable_alloc(void)
 	return phys;
 }
 
+static bool pgattr_change_is_safe(u64 old, u64 new)
+{
+	/*
+	 * The following mapping attributes may be updated in live
+	 * kernel mappings without the need for break-before-make.
+	 */
+	static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE;
+
+	return old  == 0 || new  == 0 || ((old ^ new) & ~mask) == 0;
+}
+
 static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 				  unsigned long end, unsigned long pfn,
 				  pgprot_t prot,
-				  phys_addr_t (*pgtable_alloc)(void))
+				  phys_addr_t (*pgtable_alloc)(void),
+				  bool page_mappings_only)
 {
+	pgprot_t __prot = prot;
 	pte_t *pte;
 
 	BUG_ON(pmd_sect(*pmd));
@@ -115,8 +127,28 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 
 	pte = pte_set_fixmap_offset(pmd, addr);
 	do {
-		set_pte(pte, pfn_pte(pfn, prot));
+		pte_t old_pte = *pte;
+
+		/*
+		 * Set the contiguous bit for the subsequent group of PTEs if
+		 * its size and alignment are appropriate.
+		 */
+		if (((addr | PFN_PHYS(pfn)) & ~CONT_PTE_MASK) == 0) {
+			if (end - addr >= CONT_PTE_SIZE && !page_mappings_only)
+				__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+			else
+				__prot = prot;
+		}
+
+		set_pte(pte, pfn_pte(pfn, __prot));
 		pfn++;
+
+		/*
+		 * After the PTE entry has been populated once, we
+		 * only allow updates to the permission attributes.
+		 */
+		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte)));
+
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	pte_clear_fixmap();
@@ -125,8 +157,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 				  phys_addr_t phys, pgprot_t prot,
 				  phys_addr_t (*pgtable_alloc)(void),
-				  bool allow_block_mappings)
+				  bool page_mappings_only)
 {
+	pgprot_t __prot = prot;
 	pmd_t *pmd;
 	unsigned long next;
 
@@ -146,27 +179,39 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 
 	pmd = pmd_set_fixmap_offset(pud, addr);
 	do {
+		pmd_t old_pmd = *pmd;
+
 		next = pmd_addr_end(addr, end);
+
 		/* try section mapping first */
 		if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
-		      allow_block_mappings) {
-			pmd_t old_pmd =*pmd;
-			pmd_set_huge(pmd, phys, prot);
+		      !page_mappings_only) {
 			/*
-			 * Check for previous table entries created during
-			 * boot (__create_page_tables) and flush them.
+			 * Set the contiguous bit for the subsequent group of
+			 * PMDs if its size and alignment are appropriate.
 			 */
-			if (!pmd_none(old_pmd)) {
-				flush_tlb_all();
-				if (pmd_table(old_pmd)) {
-					phys_addr_t table = pmd_page_paddr(old_pmd);
-					if (!WARN_ON_ONCE(slab_is_available()))
-						memblock_free(table, PAGE_SIZE);
-				}
+			if (((addr | phys) & ~CONT_PMD_MASK) == 0) {
+				if (end - addr >= CONT_PMD_SIZE)
+					__prot = __pgprot(pgprot_val(prot) |
+							  PTE_CONT);
+				else
+					__prot = prot;
 			}
+			pmd_set_huge(pmd, phys, __prot);
+
+			/*
+			 * After the PMD entry has been populated once, we
+			 * only allow updates to the permission attributes.
+			 */
+			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
+						      pmd_val(*pmd)));
 		} else {
 			alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys),
-				       prot, pgtable_alloc);
+				       prot, pgtable_alloc,
+				       page_mappings_only);
+
+			BUG_ON(pmd_val(old_pmd) != 0 &&
+			       pmd_val(old_pmd) != pmd_val(*pmd));
 		}
 		phys += next - addr;
 	} while (pmd++, addr = next, addr != end);
@@ -189,7 +234,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next,
 static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 				  phys_addr_t phys, pgprot_t prot,
 				  phys_addr_t (*pgtable_alloc)(void),
-				  bool allow_block_mappings)
+				  bool page_mappings_only)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -204,33 +249,28 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 
 	pud = pud_set_fixmap_offset(pgd, addr);
 	do {
+		pud_t old_pud = *pud;
+
 		next = pud_addr_end(addr, end);
 
 		/*
 		 * For 4K granule only, attempt to put down a 1GB block
 		 */
-		if (use_1G_block(addr, next, phys) && allow_block_mappings) {
-			pud_t old_pud = *pud;
+		if (use_1G_block(addr, next, phys) && !page_mappings_only) {
 			pud_set_huge(pud, phys, prot);
 
 			/*
-			 * If we have an old value for a pud, it will
-			 * be pointing to a pmd table that we no longer
-			 * need (from swapper_pg_dir).
-			 *
-			 * Look up the old pmd table and free it.
+			 * After the PUD entry has been populated once, we
+			 * only allow updates to the permission attributes.
 			 */
-			if (!pud_none(old_pud)) {
-				flush_tlb_all();
-				if (pud_table(old_pud)) {
-					phys_addr_t table = pud_page_paddr(old_pud);
-					if (!WARN_ON_ONCE(slab_is_available()))
-						memblock_free(table, PAGE_SIZE);
-				}
-			}
+			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
+						      pud_val(*pud)));
 		} else {
 			alloc_init_pmd(pud, addr, next, phys, prot,
-				       pgtable_alloc, allow_block_mappings);
+				       pgtable_alloc, page_mappings_only);
+
+			BUG_ON(pud_val(old_pud) != 0 &&
+			       pud_val(old_pud) != pud_val(*pud));
 		}
 		phys += next - addr;
 	} while (pud++, addr = next, addr != end);
@@ -242,7 +282,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 				 unsigned long virt, phys_addr_t size,
 				 pgprot_t prot,
 				 phys_addr_t (*pgtable_alloc)(void),
-				 bool allow_block_mappings)
+				 bool page_mappings_only)
 {
 	unsigned long addr, length, end, next;
 	pgd_t *pgd = pgd_offset_raw(pgdir, virt);
@@ -262,7 +302,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 	do {
 		next = pgd_addr_end(addr, end);
 		alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc,
-			       allow_block_mappings);
+			       page_mappings_only);
 		phys += next - addr;
 	} while (pgd++, addr = next, addr != end);
 }
@@ -291,17 +331,17 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 			&phys, virt);
 		return;
 	}
-	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, true);
+	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false);
 }
 
 void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
-			       pgprot_t prot, bool allow_block_mappings)
+			       pgprot_t prot, bool page_mappings_only)
 {
 	BUG_ON(mm == &init_mm);
 
 	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
-			     pgd_pgtable_alloc, allow_block_mappings);
+			     pgd_pgtable_alloc, page_mappings_only);
 }
 
 static void create_mapping_late(phys_addr_t phys, unsigned long virt,
@@ -314,7 +354,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt,
 	}
 
 	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
-			     NULL, !debug_pagealloc_enabled());
+			     NULL, debug_pagealloc_enabled());
 }
 
 static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end)
@@ -332,7 +372,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
 		__create_pgd_mapping(pgd, start, __phys_to_virt(start),
 				     end - start, PAGE_KERNEL,
 				     early_pgtable_alloc,
-				     !debug_pagealloc_enabled());
+				     debug_pagealloc_enabled());
 		return;
 	}
 
@@ -345,13 +385,13 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
 				     __phys_to_virt(start),
 				     kernel_start - start, PAGE_KERNEL,
 				     early_pgtable_alloc,
-				     !debug_pagealloc_enabled());
+				     debug_pagealloc_enabled());
 	if (kernel_end < end)
 		__create_pgd_mapping(pgd, kernel_end,
 				     __phys_to_virt(kernel_end),
 				     end - kernel_end, PAGE_KERNEL,
 				     early_pgtable_alloc,
-				     !debug_pagealloc_enabled());
+				     debug_pagealloc_enabled());
 
 	/*
 	 * Map the linear alias of the [_text, __init_begin) interval as
@@ -361,7 +401,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
 	 */
 	__create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start),
 			     kernel_end - kernel_start, PAGE_KERNEL_RO,
-			     early_pgtable_alloc, !debug_pagealloc_enabled());
+			     early_pgtable_alloc, debug_pagealloc_enabled());
 }
 
 static void __init map_mem(pgd_t *pgd)
@@ -396,6 +436,11 @@ void mark_rodata_ro(void)
 	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
 	create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
 			    section_size, PAGE_KERNEL_RO);
+
+	/* flush the TLBs after updating live kernel mappings */
+	flush_tlb_all();
+
+	debug_checkwx();
 }
 
 static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
@@ -408,7 +453,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
 	BUG_ON(!PAGE_ALIGNED(size));
 
 	__create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
-			     early_pgtable_alloc, !debug_pagealloc_enabled());
+			     early_pgtable_alloc, debug_pagealloc_enabled());
 
 	vma->addr	= va_start;
 	vma->phys_addr	= pa_start;
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 352c73b6a59e..32682be978e0 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -70,11 +70,14 @@ ENTRY(cpu_do_suspend)
 	mrs	x8, mdscr_el1
 	mrs	x9, oslsr_el1
 	mrs	x10, sctlr_el1
+	mrs	x11, tpidr_el1
+	mrs	x12, sp_el0
 	stp	x2, x3, [x0]
 	stp	x4, xzr, [x0, #16]
 	stp	x5, x6, [x0, #32]
 	stp	x7, x8, [x0, #48]
 	stp	x9, x10, [x0, #64]
+	stp	x11, x12, [x0, #80]
 	ret
 ENDPROC(cpu_do_suspend)
 
@@ -90,6 +93,7 @@ ENTRY(cpu_do_resume)
 	ldp	x6, x8, [x0, #32]
 	ldp	x9, x10, [x0, #48]
 	ldp	x11, x12, [x0, #64]
+	ldp	x13, x14, [x0, #80]
 	msr	tpidr_el0, x2
 	msr	tpidrro_el0, x3
 	msr	contextidr_el1, x4
@@ -112,6 +116,8 @@ ENTRY(cpu_do_resume)
 	msr	mdscr_el1, x10
 
 	msr	sctlr_el1, x12
+	msr	tpidr_el1, x13
+	msr	sp_el0, x14
 	/*
 	 * Restore oslsr_el1 by writing oslar_el1
 	 */
@@ -136,11 +142,7 @@ ENTRY(cpu_do_switch_mm)
 	bfi	x0, x1, #48, #16		// set the ASID
 	msr	ttbr0_el1, x0			// set TTBR0
 	isb
-alternative_if ARM64_WORKAROUND_CAVIUM_27456
-	ic	iallu
-	dsb	nsh
-	isb
-alternative_else_nop_endif
+	post_ttbr0_update_workaround
 	ret
 ENDPROC(cpu_do_switch_mm)
 
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
new file mode 100644
index 000000000000..eee4d864350c
--- /dev/null
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -0,0 +1,31 @@
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <asm/ptdump.h>
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+	struct ptdump_info *info = m->private;
+	ptdump_walk_pgd(m, info);
+	return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ptdump_show, inode->i_private);
+}
+
+static const struct file_operations ptdump_fops = {
+	.open		= ptdump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+int ptdump_debugfs_register(struct ptdump_info *info, const char *name)
+{
+	struct dentry *pe;
+	pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops);
+	return pe ? 0 : -ENOMEM;
+
+}
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 329c8027b0a9..b41aff25426d 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -49,6 +49,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/uaccess.h>
 #include <xen/interface/xen.h>
 
 
@@ -91,6 +92,20 @@ ENTRY(privcmd_call)
 	mov x2, x3
 	mov x3, x4
 	mov x4, x5
+	/*
+	 * Privcmd calls are issued by the userspace. The kernel needs to
+	 * enable access to TTBR0_EL1 as the hypervisor would issue stage 1
+	 * translations to user memory via AT instructions. Since AT
+	 * instructions are not affected by the PAN bit (ARMv8.1), we only
+	 * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation
+	 * is enabled (it implies that hardware UAO and PAN disabled).
+	 */
+	uaccess_ttbr0_enable x6, x7
 	hvc XEN_IMM
+
+	/*
+	 * Disable userspace access from kernel once the hyp call completed.
+	 */
+	uaccess_ttbr0_disable x6
 	ret
 ENDPROC(privcmd_call);
diff --git a/arch/blackfin/mach-bf561/coreb.c b/arch/blackfin/mach-bf561/coreb.c
index 8a2543c654b3..cf27554e76bf 100644
--- a/arch/blackfin/mach-bf561/coreb.c
+++ b/arch/blackfin/mach-bf561/coreb.c
@@ -1,5 +1,7 @@
 /* Load firmware into Core B on a BF561
  *
+ * Author: Bas Vermeulen <bvermeul@blackstar.xs4all.nl>
+ *
  * Copyright 2004-2009 Analog Devices Inc.
  * Licensed under the GPL-2 or later.
  */
@@ -14,9 +16,9 @@
 
 #include <linux/device.h>
 #include <linux/fs.h>
+#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/miscdevice.h>
-#include <linux/module.h>
 
 #define CMD_COREB_START		_IO('b', 0)
 #define CMD_COREB_STOP		_IO('b', 1)
@@ -59,8 +61,4 @@ static struct miscdevice coreb_dev = {
 	.name  = "coreb",
 	.fops  = &coreb_fops,
 };
-module_misc_device(coreb_dev);
-
-MODULE_AUTHOR("Bas Vermeulen <bvermeul@blackstar.xs4all.nl>");
-MODULE_DESCRIPTION("BF561 Core B Support");
-MODULE_LICENSE("GPL");
+builtin_misc_device(coreb_dev);
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index fae2f9447792..6080582a26d1 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -341,7 +341,7 @@ void output_pm_defines(void)
 
 void output_kvm_defines(void)
 {
-	COMMENT(" KVM/MIPS Specfic offsets. ");
+	COMMENT(" KVM/MIPS Specific offsets. ");
 
 	OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
 	OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index e407af2b7333..2e6a823fa502 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -70,7 +70,9 @@
 
 #define HPTE_V_SSIZE_SHIFT	62
 #define HPTE_V_AVPN_SHIFT	7
+#define HPTE_V_COMMON_BITS	ASM_CONST(0x000fffffffffffff)
 #define HPTE_V_AVPN		ASM_CONST(0x3fffffffffffff80)
+#define HPTE_V_AVPN_3_0		ASM_CONST(0x000fffffffffff80)
 #define HPTE_V_AVPN_VAL(x)	(((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT)
 #define HPTE_V_COMPARE(x,y)	(!(((x) ^ (y)) & 0xffffffffffffff80UL))
 #define HPTE_V_BOLTED		ASM_CONST(0x0000000000000010)
@@ -80,14 +82,16 @@
 #define HPTE_V_VALID		ASM_CONST(0x0000000000000001)
 
 /*
- * ISA 3.0 have a different HPTE format.
+ * ISA 3.0 has a different HPTE format.
  */
 #define HPTE_R_3_0_SSIZE_SHIFT	58
+#define HPTE_R_3_0_SSIZE_MASK	(3ull << HPTE_R_3_0_SSIZE_SHIFT)
 #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
 #define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT	12
 #define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
+#define HPTE_R_RPN_3_0		ASM_CONST(0x01fffffffffff000)
 #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
 #define HPTE_R_PPP		ASM_CONST(0x8000000000000003)
 #define HPTE_R_N		ASM_CONST(0x0000000000000004)
@@ -316,12 +320,43 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
 	 */
 	v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
 	v <<= HPTE_V_AVPN_SHIFT;
-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
-		v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+	v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
 	return v;
 }
 
 /*
+ * ISA v3.0 defines a new HPTE format, which differs from the old
+ * format in having smaller AVPN and ARPN fields, and the B field
+ * in the second dword instead of the first.
+ */
+static inline unsigned long hpte_old_to_new_v(unsigned long v)
+{
+	/* trim AVPN, drop B */
+	return v & HPTE_V_COMMON_BITS;
+}
+
+static inline unsigned long hpte_old_to_new_r(unsigned long v, unsigned long r)
+{
+	/* move B field from 1st to 2nd dword, trim ARPN */
+	return (r & ~HPTE_R_3_0_SSIZE_MASK) |
+		(((v) >> HPTE_V_SSIZE_SHIFT) << HPTE_R_3_0_SSIZE_SHIFT);
+}
+
+static inline unsigned long hpte_new_to_old_v(unsigned long v, unsigned long r)
+{
+	/* insert B field */
+	return (v & HPTE_V_COMMON_BITS) |
+		((r & HPTE_R_3_0_SSIZE_MASK) <<
+		 (HPTE_V_SSIZE_SHIFT - HPTE_R_3_0_SSIZE_SHIFT));
+}
+
+static inline unsigned long hpte_new_to_old_r(unsigned long r)
+{
+	/* clear out B field */
+	return r & ~HPTE_R_3_0_SSIZE_MASK;
+}
+
+/*
  * This function sets the AVPN and L fields of the HPTE  appropriately
  * using the base page size and actual page size.
  */
@@ -341,12 +376,8 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
  * aligned for the requested page size
  */
 static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
-					  int actual_psize, int ssize)
+					  int actual_psize)
 {
-
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT;
-
 	/* A 4K page needs no special encoding */
 	if (actual_psize == MMU_PAGE_4K)
 		return pa & HPTE_R_RPN;
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 05cabed3d1bd..09a802bb702f 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -99,6 +99,7 @@
 #define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
 #define BOOK3S_INTERRUPT_HMI		0xe60
 #define BOOK3S_INTERRUPT_H_DOORBELL	0xe80
+#define BOOK3S_INTERRUPT_H_VIRT		0xea0
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 28350a294b1e..e59b172666cd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
-#define KVM_HALT_POLL_NS_DEFAULT 500000
+#define KVM_HALT_POLL_NS_DEFAULT 10000	/* 10 us */
 
 /* These values are internal and can be increased later */
 #define KVM_NR_IRQCHIPS          1
@@ -244,8 +244,10 @@ struct kvm_arch_memory_slot {
 struct kvm_arch {
 	unsigned int lpid;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	unsigned int tlb_sets;
 	unsigned long hpt_virt;
 	struct revmap_entry *revmap;
+	atomic64_t mmio_update;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
 	unsigned long sdr1;
@@ -408,6 +410,24 @@ struct kvmppc_passthru_irqmap {
 #define KVMPPC_IRQ_MPIC		1
 #define KVMPPC_IRQ_XICS		2
 
+#define MMIO_HPTE_CACHE_SIZE	4
+
+struct mmio_hpte_cache_entry {
+	unsigned long hpte_v;
+	unsigned long hpte_r;
+	unsigned long rpte;
+	unsigned long pte_index;
+	unsigned long eaddr;
+	unsigned long slb_v;
+	long mmio_update;
+	unsigned int slb_base_pshift;
+};
+
+struct mmio_hpte_cache {
+	struct mmio_hpte_cache_entry entry[MMIO_HPTE_CACHE_SIZE];
+	unsigned int index;
+};
+
 struct openpic;
 
 struct kvm_vcpu_arch {
@@ -498,6 +518,8 @@ struct kvm_vcpu_arch {
 	ulong tcscr;
 	ulong acop;
 	ulong wort;
+	ulong tid;
+	ulong psscr;
 	ulong shadow_srr1;
 #endif
 	u32 vrsave; /* also USPRG0 */
@@ -546,6 +568,7 @@ struct kvm_vcpu_arch {
 	u64 tfiar;
 
 	u32 cr_tm;
+	u64 xer_tm;
 	u64 lr_tm;
 	u64 ctr_tm;
 	u64 amr_tm;
@@ -655,9 +678,11 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	struct kvm_vcpu_arch_shared shregs;
 
+	struct mmio_hpte_cache mmio_cache;
 	unsigned long pgfault_addr;
 	long pgfault_index;
 	unsigned long pgfault_hpte[2];
+	struct mmio_hpte_cache_entry *pgfault_cache;
 
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f6e49640dbe1..2da67bf1f2ec 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -483,9 +483,10 @@ extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
 				   unsigned long host_irq);
 extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
 				   unsigned long host_irq);
-extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
-				 struct kvmppc_irq_map *irq_map,
-				 struct kvmppc_passthru_irqmap *pimap);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr,
+					struct kvmppc_irq_map *irq_map,
+					struct kvmppc_passthru_irqmap *pimap,
+					bool *again);
 extern int h_ipi_redirect;
 #else
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
@@ -510,6 +511,48 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 #endif
 
 /*
+ * Prototypes for functions called only from assembler code.
+ * Having prototypes reduces sparse errors.
+ */
+long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+			 unsigned long ioba, unsigned long tce);
+long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+				  unsigned long liobn, unsigned long ioba,
+				  unsigned long tce_list, unsigned long npages);
+long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
+			   unsigned long liobn, unsigned long ioba,
+			   unsigned long tce_value, unsigned long npages);
+long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
+                            unsigned int yield_count);
+long kvmppc_h_random(struct kvm_vcpu *vcpu);
+void kvmhv_commence_exit(int trap);
+long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
+void kvmppc_subcore_enter_guest(void);
+void kvmppc_subcore_exit_guest(void);
+long kvmppc_realmode_hmi_handler(void);
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+                    long pte_index, unsigned long pteh, unsigned long ptel);
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+                     unsigned long pte_index, unsigned long avpn);
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu);
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+                      unsigned long pte_index, unsigned long avpn,
+                      unsigned long va);
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+                   unsigned long pte_index);
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+                        unsigned long pte_index);
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+                        unsigned long pte_index);
+long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+                          unsigned long slb_v, unsigned int status, bool data);
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+                    unsigned long mfrr);
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+
+/*
  * Host-side operations we want to set up while running in real
  * mode in the guest operating on the xics.
  * Currently only VCPU wakeup is supported.
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e311c25751a4..8d1499334257 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -214,6 +214,11 @@ extern u64 ppc64_rma_size;
 /* Cleanup function used by kexec */
 extern void mmu_cleanup_all(void);
 extern void radix__mmu_cleanup_all(void);
+
+/* Functions for creating and updating partition table on POWER9 */
+extern void mmu_partition_table_init(void);
+extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+					  unsigned long dw1);
 #endif /* CONFIG_PPC64 */
 
 struct mm_struct;
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index e958b7096f19..5c7db0f1a708 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -220,9 +220,12 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id,
 int64_t opal_pci_poll2(uint64_t id, uint64_t data);
 
 int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll);
+int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll);
 int64_t opal_int_set_cppr(uint8_t cppr);
 int64_t opal_int_eoi(uint32_t xirr);
+int64_t opal_rm_int_eoi(uint32_t xirr);
 int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
+int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
 int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 			  uint32_t pe_num, uint32_t tce_size,
 			  uint64_t dma_addr, uint32_t npages);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 9e1499f98def..04aa1ee8cdb6 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -153,6 +153,8 @@
 #define PSSCR_EC		0x00100000 /* Exit Criterion */
 #define PSSCR_ESL		0x00200000 /* Enable State Loss */
 #define PSSCR_SD		0x00400000 /* Status Disable */
+#define PSSCR_PLS	0xf000000000000000 /* Power-saving Level Status */
+#define PSSCR_GUEST_VIS	0xf0000000000003ff /* Guest-visible PSSCR fields */
 
 /* Floating Point Status and Control Register (FPSCR) Fields */
 #define FPSCR_FX	0x80000000	/* FPU exception summary */
@@ -236,6 +238,7 @@
 #define SPRN_TEXASRU	0x83	/* ''	   ''	   ''	 Upper 32  */
 #define   TEXASR_FS	__MASK(63-36) /* TEXASR Failure Summary */
 #define SPRN_TFHAR	0x80	/* Transaction Failure Handler Addr */
+#define SPRN_TIDR	144	/* Thread ID register */
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
 #define   CTRL_CT	0xc0000000	/* current thread */
@@ -294,6 +297,7 @@
 #define SPRN_HSRR1	0x13B	/* Hypervisor Save/Restore 1 */
 #define SPRN_LMRR	0x32D	/* Load Monitor Region Register */
 #define SPRN_LMSER	0x32E	/* Load Monitor Section Enable Register */
+#define SPRN_ASDR	0x330	/* Access segment descriptor register */
 #define SPRN_IC		0x350	/* Virtual Instruction Count */
 #define SPRN_VTB	0x351	/* Virtual Time Base */
 #define SPRN_LDBAR	0x352	/* LD Base Address Register */
@@ -305,6 +309,7 @@
 
 /* HFSCR and FSCR bit numbers are the same */
 #define FSCR_LM_LG	11	/* Enable Load Monitor Registers */
+#define FSCR_MSGP_LG	10	/* Enable MSGP */
 #define FSCR_TAR_LG	8	/* Enable Target Address Register */
 #define FSCR_EBB_LG	7	/* Enable Event Based Branching */
 #define FSCR_TM_LG	5	/* Enable Transactional Memory */
@@ -320,6 +325,7 @@
 #define   FSCR_DSCR	__MASK(FSCR_DSCR_LG)
 #define SPRN_HFSCR	0xbe	/* HV=1 Facility Status & Control Register */
 #define   HFSCR_LM	__MASK(FSCR_LM_LG)
+#define   HFSCR_MSGP	__MASK(FSCR_MSGP_LG)
 #define   HFSCR_TAR	__MASK(FSCR_TAR_LG)
 #define   HFSCR_EBB	__MASK(FSCR_EBB_LG)
 #define   HFSCR_TM	__MASK(FSCR_TM_LG)
@@ -358,6 +364,7 @@
 #define     LPCR_PECE_HVEE	ASM_CONST(0x0000400000000000)	/* P9 Wakeup on HV interrupts */
 #define   LPCR_MER		ASM_CONST(0x0000000000000800)	/* Mediated External Exception */
 #define   LPCR_MER_SH		11
+#define	  LPCR_GTSE		ASM_CONST(0x0000000000000400)  	/* Guest Translation Shootdown Enable */
 #define   LPCR_TC		ASM_CONST(0x0000000000000200)	/* Translation control */
 #define   LPCR_LPES		0x0000000c
 #define   LPCR_LPES0		ASM_CONST(0x0000000000000008)      /* LPAR Env selector 0 */
@@ -378,6 +385,12 @@
 #define   PCR_VEC_DIS	(1ul << (63-0))	/* Vec. disable (bit NA since POWER8) */
 #define   PCR_VSX_DIS	(1ul << (63-1))	/* VSX disable (bit NA since POWER8) */
 #define   PCR_TM_DIS	(1ul << (63-2))	/* Trans. memory disable (POWER8) */
+/*
+ * These bits are used in the function kvmppc_set_arch_compat() to specify and
+ * determine both the compatibility level which we want to emulate and the
+ * compatibility level which the host is capable of emulating.
+ */
+#define   PCR_ARCH_207	0x8		/* Architecture 2.07 */
 #define   PCR_ARCH_206	0x4		/* Architecture 2.06 */
 #define   PCR_ARCH_205	0x2		/* Architecture 2.05 */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
@@ -1219,6 +1232,7 @@
 #define PVR_ARCH_206	0x0f000003
 #define PVR_ARCH_206p	0x0f100003
 #define PVR_ARCH_207	0x0f000004
+#define PVR_ARCH_300	0x0f000005
 
 /* Macros for setting and retrieving special purpose registers */
 #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35ce379..3603b6f51b11 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -573,6 +573,10 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_SPRG9	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba)
 #define KVM_REG_PPC_DBSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb)
 
+/* POWER9 registers */
+#define KVM_REG_PPC_TIDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
+#define KVM_REG_PPC_PSSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
+
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
  */
@@ -596,6 +600,7 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_TM_VSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67)
 #define KVM_REG_PPC_TM_DSCR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68)
 #define KVM_REG_PPC_TM_TAR	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69)
+#define KVM_REG_PPC_TM_XER	(KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x6a)
 
 /* PPC64 eXternal Interrupt Controller Specification */
 #define KVM_DEV_XICS_GRP_SOURCES	1	/* 64-bit source attributes */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index caec7bf3b99a..195a9fc8f81c 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -487,6 +487,7 @@ int main(void)
 
 	/* book3s */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	DEFINE(KVM_TLB_SETS, offsetof(struct kvm, arch.tlb_sets));
 	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
 	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
@@ -548,6 +549,8 @@ int main(void)
 	DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr));
 	DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
 	DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
+	DEFINE(VCPU_TID, offsetof(struct kvm_vcpu, arch.tid));
+	DEFINE(VCPU_PSSCR, offsetof(struct kvm_vcpu, arch.psscr));
 	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
@@ -569,6 +572,7 @@ int main(void)
 	DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr));
 	DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm));
 	DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm));
+	DEFINE(VCPU_XER_TM, offsetof(struct kvm_vcpu, arch.xer_tm));
 	DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm));
 	DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm));
 	DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm));
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 37c027ca83b2..f3e1f5d29dce 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -174,7 +174,7 @@ __init_FSCR:
 __init_HFSCR:
 	mfspr	r3,SPRN_HFSCR
 	ori	r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
-		      HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB
+		      HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP
 	mtspr	SPRN_HFSCR,r3
 	blr
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 05f09ae82587..b795dd1ac2ef 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -88,6 +88,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 	/* 128 (2**7) bytes in each HPTEG */
 	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
 
+	atomic64_set(&kvm->arch.mmio_update, 0);
+
 	/* Allocate reverse map array */
 	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
 	if (!rev) {
@@ -255,7 +257,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 	kvmppc_set_msr(vcpu, msr);
 }
 
-long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
 				long pte_index, unsigned long pteh,
 				unsigned long ptel, unsigned long *pte_idx_ret)
 {
@@ -312,7 +314,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	struct kvmppc_slb *slbe;
 	unsigned long slb_v;
 	unsigned long pp, key;
-	unsigned long v, gr;
+	unsigned long v, orig_v, gr;
 	__be64 *hptep;
 	int index;
 	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
@@ -337,10 +339,12 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		return -ENOENT;
 	}
 	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
-	v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
 	gr = kvm->arch.revmap[index].guest_rpte;
 
-	unlock_hpte(hptep, v);
+	unlock_hpte(hptep, orig_v);
 	preempt_enable();
 
 	gpte->eaddr = eaddr;
@@ -438,6 +442,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 {
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long hpte[3], r;
+	unsigned long hnow_v, hnow_r;
 	__be64 *hptep;
 	unsigned long mmu_seq, psize, pte_size;
 	unsigned long gpa_base, gfn_base;
@@ -451,6 +456,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unsigned int writing, write_ok;
 	struct vm_area_struct *vma;
 	unsigned long rcbits;
+	long mmio_update;
 
 	/*
 	 * Real-mode code has already searched the HPT and found the
@@ -460,6 +466,19 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 */
 	if (ea != vcpu->arch.pgfault_addr)
 		return RESUME_GUEST;
+
+	if (vcpu->arch.pgfault_cache) {
+		mmio_update = atomic64_read(&kvm->arch.mmio_update);
+		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
+			r = vcpu->arch.pgfault_cache->rpte;
+			psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
+			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
+			gfn_base = gpa_base >> PAGE_SHIFT;
+			gpa = gpa_base | (ea & (psize - 1));
+			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+						dsisr & DSISR_ISSTORE);
+		}
+	}
 	index = vcpu->arch.pgfault_index;
 	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
 	rev = &kvm->arch.revmap[index];
@@ -472,6 +491,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unlock_hpte(hptep, hpte[0]);
 	preempt_enable();
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
+		hpte[1] = hpte_new_to_old_r(hpte[1]);
+	}
 	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
 	    hpte[1] != vcpu->arch.pgfault_hpte[1])
 		return RESUME_GUEST;
@@ -575,16 +598,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 */
 	if (psize < PAGE_SIZE)
 		psize = PAGE_SIZE;
-	r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1));
+	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
+					((pfn << PAGE_SHIFT) & ~(psize - 1));
 	if (hpte_is_writable(r) && !write_ok)
 		r = hpte_make_readonly(r);
 	ret = RESUME_GUEST;
 	preempt_disable();
 	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 		cpu_relax();
-	if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] ||
-		be64_to_cpu(hptep[1]) != hpte[1] ||
-		rev->guest_rpte != hpte[2])
+	hnow_v = be64_to_cpu(hptep[0]);
+	hnow_r = be64_to_cpu(hptep[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
+		hnow_r = hpte_new_to_old_r(hnow_r);
+	}
+	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
+	    rev->guest_rpte != hpte[2])
 		/* HPTE has been changed under us; let the guest retry */
 		goto out_unlock;
 	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
@@ -615,6 +644,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 	}
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		r = hpte_old_to_new_r(hpte[0], r);
+		hpte[0] = hpte_old_to_new_v(hpte[0]);
+	}
 	hptep[1] = cpu_to_be64(r);
 	eieio();
 	__unlock_hpte(hptep, hpte[0]);
@@ -758,6 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		    hpte_rpn(ptel, psize) == gfn) {
 			hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
 			kvmppc_invalidate_hpte(kvm, hptep, i);
+			hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
 			/* Harvest R and C */
 			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
@@ -1165,7 +1199,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
 			unsigned long *hpte, struct revmap_entry *revp,
 			int want_valid, int first_pass)
 {
-	unsigned long v, r;
+	unsigned long v, r, hr;
 	unsigned long rcbits_unset;
 	int ok = 1;
 	int valid, dirty;
@@ -1192,6 +1226,11 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
 		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
 			cpu_relax();
 		v = be64_to_cpu(hptp[0]);
+		hr = be64_to_cpu(hptp[1]);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			v = hpte_new_to_old_v(v, hr);
+			hr = hpte_new_to_old_r(hr);
+		}
 
 		/* re-evaluate valid and dirty from synchronized HPTE value */
 		valid = !!(v & HPTE_V_VALID);
@@ -1199,8 +1238,8 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
 
 		/* Harvest R and C into guest view if necessary */
 		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
-		if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) {
-			revp->guest_rpte |= (be64_to_cpu(hptp[1]) &
+		if (valid && (rcbits_unset & hr)) {
+			revp->guest_rpte |= (hr &
 				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
 			dirty = 1;
 		}
@@ -1608,7 +1647,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
 	return ret;
 }
 
-ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
 			   size_t len, loff_t *ppos)
 {
 	return -EACCES;
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index d461c440889a..e4c4ea973e57 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -39,7 +39,6 @@
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
-#include <asm/iommu.h>
 
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 39ef1f4a7b02..8dcbe37a4dac 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -54,6 +54,9 @@
 #include <asm/dbell.h>
 #include <asm/hmi.h>
 #include <asm/pnv-pci.h>
+#include <asm/mmu.h>
+#include <asm/opal.h>
+#include <asm/xics.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -62,6 +65,7 @@
 #include <linux/irqbypass.h>
 #include <linux/module.h>
 #include <linux/compiler.h>
+#include <linux/of.h>
 
 #include "book3s.h"
 
@@ -104,23 +108,6 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
-/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
-static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
-module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
-
-/* Factor by which the vcore halt poll interval is grown, default is to double
- */
-static unsigned int halt_poll_ns_grow = 2;
-module_param(halt_poll_ns_grow, int, S_IRUGO);
-MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
-
-/* Factor by which the vcore halt poll interval is shrunk, default is to reset
- */
-static unsigned int halt_poll_ns_shrink;
-module_param(halt_poll_ns_shrink, int, S_IRUGO);
-MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
-
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -146,12 +133,21 @@ static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
 
 static bool kvmppc_ipi_thread(int cpu)
 {
+	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+
+	/* On POWER9 we can use msgsnd to IPI any cpu */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		msg |= get_hard_smp_processor_id(cpu);
+		smp_mb();
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return true;
+	}
+
 	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
 		preempt_disable();
 		if (cpu_first_thread_sibling(cpu) ==
 		    cpu_first_thread_sibling(smp_processor_id())) {
-			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 			msg |= cpu_thread_in_core(cpu);
 			smp_mb();
 			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
@@ -162,8 +158,12 @@ static bool kvmppc_ipi_thread(int cpu)
 	}
 
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
-		xics_wake_cpu(cpu);
+	if (cpu >= 0 && cpu < nr_cpu_ids) {
+		if (paca[cpu].kvm_hstate.xics_phys) {
+			xics_wake_cpu(cpu);
+			return true;
+		}
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
 		return true;
 	}
 #endif
@@ -299,41 +299,54 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 	vcpu->arch.pvr = pvr;
 }
 
+/* Dummy value used in computing PCR value below */
+#define PCR_ARCH_300	(PCR_ARCH_207 << 1)
+
 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
-	unsigned long pcr = 0;
+	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+	/* We can (emulate) our own architecture version and anything older */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		host_pcr_bit = PCR_ARCH_300;
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		host_pcr_bit = PCR_ARCH_207;
+	else if (cpu_has_feature(CPU_FTR_ARCH_206))
+		host_pcr_bit = PCR_ARCH_206;
+	else
+		host_pcr_bit = PCR_ARCH_205;
+
+	/* Determine lowest PCR bit needed to run guest in given PVR level */
+	guest_pcr_bit = host_pcr_bit;
 	if (arch_compat) {
 		switch (arch_compat) {
 		case PVR_ARCH_205:
-			/*
-			 * If an arch bit is set in PCR, all the defined
-			 * higher-order arch bits also have to be set.
-			 */
-			pcr = PCR_ARCH_206 | PCR_ARCH_205;
+			guest_pcr_bit = PCR_ARCH_205;
 			break;
 		case PVR_ARCH_206:
 		case PVR_ARCH_206p:
-			pcr = PCR_ARCH_206;
+			guest_pcr_bit = PCR_ARCH_206;
 			break;
 		case PVR_ARCH_207:
+			guest_pcr_bit = PCR_ARCH_207;
+			break;
+		case PVR_ARCH_300:
+			guest_pcr_bit = PCR_ARCH_300;
 			break;
 		default:
 			return -EINVAL;
 		}
-
-		if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
-			/* POWER7 can't emulate POWER8 */
-			if (!(pcr & PCR_ARCH_206))
-				return -EINVAL;
-			pcr &= ~PCR_ARCH_206;
-		}
 	}
 
+	/* Check requested PCR bits don't exceed our capabilities */
+	if (guest_pcr_bit > host_pcr_bit)
+		return -EINVAL;
+
 	spin_lock(&vc->lock);
 	vc->arch_compat = arch_compat;
-	vc->pcr = pcr;
+	/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
+	vc->pcr = host_pcr_bit - guest_pcr_bit;
 	spin_unlock(&vc->lock);
 
 	return 0;
@@ -945,6 +958,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	case BOOK3S_INTERRUPT_EXTERNAL:
 	case BOOK3S_INTERRUPT_H_DOORBELL:
+	case BOOK3S_INTERRUPT_H_VIRT:
 		vcpu->stat.ext_intr_exits++;
 		r = RESUME_GUEST;
 		break;
@@ -1229,6 +1243,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_WORT:
 		*val = get_reg_val(id, vcpu->arch.wort);
 		break;
+	case KVM_REG_PPC_TIDR:
+		*val = get_reg_val(id, vcpu->arch.tid);
+		break;
+	case KVM_REG_PPC_PSSCR:
+		*val = get_reg_val(id, vcpu->arch.psscr);
+		break;
 	case KVM_REG_PPC_VPA_ADDR:
 		spin_lock(&vcpu->arch.vpa_update_lock);
 		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
@@ -1288,6 +1308,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_TM_CR:
 		*val = get_reg_val(id, vcpu->arch.cr_tm);
 		break;
+	case KVM_REG_PPC_TM_XER:
+		*val = get_reg_val(id, vcpu->arch.xer_tm);
+		break;
 	case KVM_REG_PPC_TM_LR:
 		*val = get_reg_val(id, vcpu->arch.lr_tm);
 		break;
@@ -1427,6 +1450,12 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_WORT:
 		vcpu->arch.wort = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_TIDR:
+		vcpu->arch.tid = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PSSCR:
+		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
+		break;
 	case KVM_REG_PPC_VPA_ADDR:
 		addr = set_reg_val(id, *val);
 		r = -EINVAL;
@@ -1498,6 +1527,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_TM_CR:
 		vcpu->arch.cr_tm = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_TM_XER:
+		vcpu->arch.xer_tm = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_TM_LR:
 		vcpu->arch.lr_tm = set_reg_val(id, *val);
 		break;
@@ -1540,6 +1572,20 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	return r;
 }
 
+/*
+ * On POWER9, threads are independent and can be in different partitions.
+ * Therefore we consider each thread to be a subcore.
+ * There is a restriction that all threads have to be in the same
+ * MMU mode (radix or HPT), unfortunately, but since we only support
+ * HPT guests on a HPT host so far, that isn't an impediment yet.
+ */
+static int threads_per_vcore(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return 1;
+	return threads_per_subcore;
+}
+
 static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 {
 	struct kvmppc_vcore *vcore;
@@ -1554,7 +1600,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	init_swait_queue_head(&vcore->wq);
 	vcore->preempt_tb = TB_NIL;
 	vcore->lpcr = kvm->arch.lpcr;
-	vcore->first_vcpuid = core * threads_per_subcore;
+	vcore->first_vcpuid = core * threads_per_vcore();
 	vcore->kvm = kvm;
 	INIT_LIST_HEAD(&vcore->preempt_list);
 
@@ -1717,7 +1763,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	int core;
 	struct kvmppc_vcore *vcore;
 
-	core = id / threads_per_subcore;
+	core = id / threads_per_vcore();
 	if (core >= KVM_MAX_VCORES)
 		goto out;
 
@@ -1935,7 +1981,10 @@ static void kvmppc_wait_for_nap(void)
 {
 	int cpu = smp_processor_id();
 	int i, loops;
+	int n_threads = threads_per_vcore();
 
+	if (n_threads <= 1)
+		return;
 	for (loops = 0; loops < 1000000; ++loops) {
 		/*
 		 * Check if all threads are finished.
@@ -1943,17 +1992,17 @@ static void kvmppc_wait_for_nap(void)
 		 * and the thread clears it when finished, so we look
 		 * for any threads that still have a non-NULL vcore ptr.
 		 */
-		for (i = 1; i < threads_per_subcore; ++i)
+		for (i = 1; i < n_threads; ++i)
 			if (paca[cpu + i].kvm_hstate.kvm_vcore)
 				break;
-		if (i == threads_per_subcore) {
+		if (i == n_threads) {
 			HMT_medium();
 			return;
 		}
 		HMT_low();
 	}
 	HMT_medium();
-	for (i = 1; i < threads_per_subcore; ++i)
+	for (i = 1; i < n_threads; ++i)
 		if (paca[cpu + i].kvm_hstate.kvm_vcore)
 			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
@@ -2019,7 +2068,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
 
 	vc->vcore_state = VCORE_PREEMPT;
 	vc->pcpu = smp_processor_id();
-	if (vc->num_threads < threads_per_subcore) {
+	if (vc->num_threads < threads_per_vcore()) {
 		spin_lock(&lp->lock);
 		list_add_tail(&vc->preempt_list, &lp->list);
 		spin_unlock(&lp->lock);
@@ -2123,8 +2172,7 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	cip->subcore_threads[sub] = vc->num_threads;
 	cip->subcore_vm[sub] = vc->kvm;
 	init_master_vcore(vc);
-	list_del(&vc->preempt_list);
-	list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
+	list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
 
 	return true;
 }
@@ -2309,6 +2357,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	unsigned long cmd_bit, stat_bit;
 	int pcpu, thr;
 	int target_threads;
+	int controlled_threads;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -2327,11 +2376,18 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->preempt_tb = TB_NIL;
 
 	/*
+	 * Number of threads that we will be controlling: the same as
+	 * the number of threads per subcore, except on POWER9,
+	 * where it's 1 because the threads are (mostly) independent.
+	 */
+	controlled_threads = threads_per_vcore();
+
+	/*
 	 * Make sure we are running on primary threads, and that secondary
 	 * threads are offline.  Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
 	 */
-	if ((threads_per_core > 1) &&
+	if ((controlled_threads > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
 		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
@@ -2347,7 +2403,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 */
 	init_core_info(&core_info, vc);
 	pcpu = smp_processor_id();
-	target_threads = threads_per_subcore;
+	target_threads = controlled_threads;
 	if (target_smt_mode && target_smt_mode < target_threads)
 		target_threads = target_smt_mode;
 	if (vc->num_threads < target_threads)
@@ -2383,7 +2439,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		smp_wmb();
 	}
 	pcpu = smp_processor_id();
-	for (thr = 0; thr < threads_per_subcore; ++thr)
+	for (thr = 0; thr < controlled_threads; ++thr)
 		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
 
 	/* Initiate micro-threading (split-core) if required */
@@ -2493,7 +2549,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	}
 
 	/* Let secondaries go back to the offline loop */
-	for (i = 0; i < threads_per_subcore; ++i) {
+	for (i = 0; i < controlled_threads; ++i) {
 		kvmppc_release_hwthread(pcpu + i);
 		if (sip && sip->napped[i])
 			kvmppc_ipi_thread(pcpu + i);
@@ -2545,9 +2601,6 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
 		vc->halt_poll_ns = 10000;
 	else
 		vc->halt_poll_ns *= halt_poll_ns_grow;
-
-	if (vc->halt_poll_ns > halt_poll_max_ns)
-		vc->halt_poll_ns = halt_poll_max_ns;
 }
 
 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
@@ -2558,7 +2611,8 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
 		vc->halt_poll_ns /= halt_poll_ns_shrink;
 }
 
-/* Check to see if any of the runnable vcpus on the vcore have pending
+/*
+ * Check to see if any of the runnable vcpus on the vcore have pending
  * exceptions or are no longer ceded
  */
 static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
@@ -2657,16 +2711,18 @@ out:
 	}
 
 	/* Adjust poll time */
-	if (halt_poll_max_ns) {
+	if (halt_poll_ns) {
 		if (block_ns <= vc->halt_poll_ns)
 			;
 		/* We slept and blocked for longer than the max halt time */
-		else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+		else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
 			shrink_halt_poll_ns(vc);
 		/* We slept and our poll time is too small */
-		else if (vc->halt_poll_ns < halt_poll_max_ns &&
-				block_ns < halt_poll_max_ns)
+		else if (vc->halt_poll_ns < halt_poll_ns &&
+				block_ns < halt_poll_ns)
 			grow_halt_poll_ns(vc);
+		if (vc->halt_poll_ns > halt_poll_ns)
+			vc->halt_poll_ns = halt_poll_ns;
 	} else
 		vc->halt_poll_ns = 0;
 
@@ -2973,6 +3029,15 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 
+	/*
+	 * If we are making a new memslot, it might make
+	 * some address that was previously cached as emulated
+	 * MMIO be no longer emulated MMIO, so invalidate
+	 * all the caches of emulated MMIO translations.
+	 */
+	if (npages)
+		atomic64_inc(&kvm->arch.mmio_update);
+
 	if (npages && old->npages) {
 		/*
 		 * If modifying a memslot, reset all the rmap dirty bits.
@@ -3017,6 +3082,22 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
 	return;
 }
 
+static void kvmppc_setup_partition_table(struct kvm *kvm)
+{
+	unsigned long dw0, dw1;
+
+	/* PS field - page size for VRMA */
+	dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
+		((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
+	/* HTABSIZE and HTABORG fields */
+	dw0 |= kvm->arch.sdr1;
+
+	/* Second dword has GR=0; other fields are unused since UPRT=0 */
+	dw1 = 0;
+
+	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+}
+
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
@@ -3068,17 +3149,20 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	      psize == 0x1000000))
 		goto out_srcu;
 
-	/* Update VRMASD field in the LPCR */
 	senc = slb_pgsize_encoding(psize);
 	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
 		(VRMA_VSID << SLB_VSID_SHIFT_1T);
-	/* the -4 is to account for senc values starting at 0x10 */
-	lpcr = senc << (LPCR_VRMASD_SH - 4);
-
 	/* Create HPTEs in the hash page table for the VRMA */
 	kvmppc_map_vrma(vcpu, memslot, porder);
 
-	kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+	/* Update VRMASD field in the LPCR */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/* the -4 is to account for senc values starting at 0x10 */
+		lpcr = senc << (LPCR_VRMASD_SH - 4);
+		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+	} else {
+		kvmppc_setup_partition_table(kvm);
+	}
 
 	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
 	smp_wmb();
@@ -3193,14 +3277,18 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
 	 * make sure we flush on each core before running the new VM.
+	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
+	 * does this flush for us.
 	 */
-	cpumask_setall(&kvm->arch.need_tlb_flush);
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		cpumask_setall(&kvm->arch.need_tlb_flush);
 
 	/* Start out with the default set of hcalls enabled */
 	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
 	       sizeof(kvm->arch.enabled_hcalls));
 
-	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
 	/* Init LPCR for virtual RMA mode */
 	kvm->arch.host_lpid = mfspr(SPRN_LPID);
@@ -3213,9 +3301,29 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	/* On POWER8 turn on online bit to enable PURR/SPURR */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		lpcr |= LPCR_ONL;
+	/*
+	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
+	 * Set HVICE bit to enable hypervisor virtualization interrupts.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		lpcr &= ~LPCR_VPM0;
+		lpcr |= LPCR_HVICE;
+	}
+
 	kvm->arch.lpcr = lpcr;
 
 	/*
+	 * Work out how many sets the TLB has, for the use of
+	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
+	else
+		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */
+
+	/*
 	 * Track that we now have a HV mode VM active. This blocks secondary
 	 * CPU threads from coming online.
 	 */
@@ -3279,9 +3387,9 @@ static int kvmppc_core_check_processor_compat_hv(void)
 	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EIO;
 	/*
-	 * Disable KVM for Power9, untill the required bits merged.
+	 * Disable KVM for Power9 in radix mode.
 	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
 		return -EIO;
 
 	return 0;
@@ -3635,6 +3743,23 @@ static int kvmppc_book3s_init_hv(void)
 	if (r)
 		return r;
 
+	/*
+	 * We need a way of accessing the XICS interrupt controller,
+	 * either directly, via paca[cpu].kvm_hstate.xics_phys, or
+	 * indirectly, via OPAL.
+	 */
+#ifdef CONFIG_SMP
+	if (!get_paca()->kvm_hstate.xics_phys) {
+		struct device_node *np;
+
+		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
+		if (!np) {
+			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
+			return -ENODEV;
+		}
+	}
+#endif
+
 	kvm_ops_hv.owner = THIS_MODULE;
 	kvmppc_hv_ops = &kvm_ops_hv;
 
@@ -3657,3 +3782,4 @@ module_exit(kvmppc_book3s_exit_hv);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(KVM_MINOR);
 MODULE_ALIAS("devname:kvm");
+
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 0c84d6bc8356..5bb24be0b346 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -26,6 +26,8 @@
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
 #include <asm/io.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
 
 #define KVM_CMA_CHUNK_ORDER	18
 
@@ -205,12 +207,18 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
 void kvmhv_rm_send_ipi(int cpu)
 {
 	unsigned long xics_phys;
+	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
-	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	/* On POWER9 we can use msgsnd for any destination cpu. */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		msg |= get_hard_smp_processor_id(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd. */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
 	    cpu_first_thread_sibling(cpu) ==
 	    cpu_first_thread_sibling(raw_smp_processor_id())) {
-		unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 		msg |= cpu_thread_in_core(cpu);
 		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
 		return;
@@ -218,7 +226,11 @@ void kvmhv_rm_send_ipi(int cpu)
 
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+	if (xics_phys)
+		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+	else
+		opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
+				     IPI_PRIORITY);
 }
 
 /*
@@ -329,7 +341,7 @@ static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
  * saved a copy of the XIRR in the PACA, it will be picked up by
  * the host ICP driver.
  */
-static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
 {
 	struct kvmppc_passthru_irqmap *pimap;
 	struct kvmppc_irq_map *irq_map;
@@ -348,11 +360,11 @@ static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
 	/* We're handling this interrupt, generic code doesn't need to */
 	local_paca->kvm_hstate.saved_xirr = 0;
 
-	return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+	return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap, again);
 }
 
 #else
-static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
 {
 	return 1;
 }
@@ -367,14 +379,31 @@ static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
  *	-1 if there was a guest wakeup IPI (which has now been cleared)
  *	-2 if there is PCI passthrough external interrupt that was handled
  */
+static long kvmppc_read_one_intr(bool *again);
 
 long kvmppc_read_intr(void)
 {
+	long ret = 0;
+	long rc;
+	bool again;
+
+	do {
+		again = false;
+		rc = kvmppc_read_one_intr(&again);
+		if (rc && (ret == 0 || rc > ret))
+			ret = rc;
+	} while (again);
+	return ret;
+}
+
+static long kvmppc_read_one_intr(bool *again)
+{
 	unsigned long xics_phys;
 	u32 h_xirr;
 	__be32 xirr;
 	u32 xisr;
 	u8 host_ipi;
+	int64_t rc;
 
 	/* see if a host IPI is pending */
 	host_ipi = local_paca->kvm_hstate.host_ipi;
@@ -383,8 +412,14 @@ long kvmppc_read_intr(void)
 
 	/* Now read the interrupt from the ICP */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
-	if (unlikely(!xics_phys))
-		return 1;
+	if (!xics_phys) {
+		/* Use OPAL to read the XIRR */
+		rc = opal_rm_int_get_xirr(&xirr, false);
+		if (rc < 0)
+			return 1;
+	} else {
+		xirr = _lwzcix(xics_phys + XICS_XIRR);
+	}
 
 	/*
 	 * Save XIRR for later. Since we get control in reverse endian
@@ -392,7 +427,6 @@ long kvmppc_read_intr(void)
 	 * host endian. Note that xirr is the value read from the
 	 * XIRR register, while h_xirr is the host endian version.
 	 */
-	xirr = _lwzcix(xics_phys + XICS_XIRR);
 	h_xirr = be32_to_cpu(xirr);
 	local_paca->kvm_hstate.saved_xirr = h_xirr;
 	xisr = h_xirr & 0xffffff;
@@ -411,8 +445,16 @@ long kvmppc_read_intr(void)
 	 * If it is an IPI, clear the MFRR and EOI it.
 	 */
 	if (xisr == XICS_IPI) {
-		_stbcix(xics_phys + XICS_MFRR, 0xff);
-		_stwcix(xics_phys + XICS_XIRR, xirr);
+		if (xics_phys) {
+			_stbcix(xics_phys + XICS_MFRR, 0xff);
+			_stwcix(xics_phys + XICS_XIRR, xirr);
+		} else {
+			opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
+			rc = opal_rm_int_eoi(h_xirr);
+			/* If rc > 0, there is another interrupt pending */
+			*again = rc > 0;
+		}
+
 		/*
 		 * Need to ensure side effects of above stores
 		 * complete before proceeding.
@@ -429,7 +471,11 @@ long kvmppc_read_intr(void)
 			/* We raced with the host,
 			 * we need to resend that IPI, bummer
 			 */
-			_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+			if (xics_phys)
+				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+			else
+				opal_rm_int_set_mfrr(hard_smp_processor_id(),
+						     IPI_PRIORITY);
 			/* Let side effects complete */
 			smp_mb();
 			return 1;
@@ -440,5 +486,5 @@ long kvmppc_read_intr(void)
 		return -1;
 	}
 
-	return kvmppc_check_passthru(xisr, xirr);
+	return kvmppc_check_passthru(xisr, xirr, again);
 }
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 0fa70a9618d7..7ef0993214f3 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -16,6 +16,7 @@
 #include <asm/machdep.h>
 #include <asm/cputhreads.h>
 #include <asm/hmi.h>
+#include <asm/kvm_ppc.h>
 
 /* SRR1 bits for machine check on POWER7 */
 #define SRR1_MC_LDSTERR		(1ul << (63-42))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 99b4e9d5dd23..9ef3c4be952f 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -264,8 +264,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	if (pa)
 		pteh |= HPTE_V_VALID;
-	else
+	else {
 		pteh |= HPTE_V_ABSENT;
+		ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+	}
 
 	/*If we had host pte mapping then  Check WIMG */
 	if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
@@ -351,6 +353,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 			/* inval in progress, write a non-present HPTE */
 			pteh |= HPTE_V_ABSENT;
 			pteh &= ~HPTE_V_VALID;
+			ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
 			unlock_rmap(rmap);
 		} else {
 			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
@@ -361,6 +364,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 		}
 	}
 
+	/* Convert to new format on P9 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		ptel = hpte_old_to_new_r(pteh, ptel);
+		pteh = hpte_old_to_new_v(pteh);
+	}
 	hpte[1] = cpu_to_be64(ptel);
 
 	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
@@ -386,6 +394,13 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
 #endif
 
+static inline int is_mmio_hpte(unsigned long v, unsigned long r)
+{
+	return ((v & HPTE_V_ABSENT) &&
+		(r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+		(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
+}
+
 static inline int try_lock_tlbie(unsigned int *lock)
 {
 	unsigned int tmp, old;
@@ -409,13 +424,18 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
 {
 	long i;
 
+	/*
+	 * We use the POWER9 5-operand versions of tlbie and tlbiel here.
+	 * Since we are using RIC=0 PRS=0 R=0, and P7/P8 tlbiel ignores
+	 * the RS field, this is backwards-compatible with P7 and P8.
+	 */
 	if (global) {
 		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
 			cpu_relax();
 		if (need_sync)
 			asm volatile("ptesync" : : : "memory");
 		for (i = 0; i < npages; ++i)
-			asm volatile(PPC_TLBIE(%1,%0) : :
+			asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
 				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
 		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 		kvm->arch.tlbie_lock = 0;
@@ -423,7 +443,8 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
 		if (need_sync)
 			asm volatile("ptesync" : : : "memory");
 		for (i = 0; i < npages; ++i)
-			asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
+			asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : :
+				     "r" (rbvalues[i]), "r" (0));
 		asm volatile("ptesync" : : : "memory");
 	}
 }
@@ -435,18 +456,23 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	__be64 *hpte;
 	unsigned long v, r, rb;
 	struct revmap_entry *rev;
-	u64 pte;
+	u64 pte, orig_pte, pte_r;
 
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
-	pte = be64_to_cpu(hpte[0]);
+	pte = orig_pte = be64_to_cpu(hpte[0]);
+	pte_r = be64_to_cpu(hpte[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		pte = hpte_new_to_old_v(pte, pte_r);
+		pte_r = hpte_new_to_old_r(pte_r);
+	}
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
 	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-		__unlock_hpte(hpte, pte);
+		__unlock_hpte(hpte, orig_pte);
 		return H_NOT_FOUND;
 	}
 
@@ -454,7 +480,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	v = pte & ~HPTE_V_HVLOCK;
 	if (v & HPTE_V_VALID) {
 		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
-		rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
+		rb = compute_tlbie_rb(v, pte_r, pte_index);
 		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
 		/*
 		 * The reference (R) and change (C) bits in a HPT
@@ -472,6 +498,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	note_hpte_modification(kvm, rev);
 	unlock_hpte(hpte, 0);
 
+	if (is_mmio_hpte(v, pte_r))
+		atomic64_inc(&kvm->arch.mmio_update);
+
 	if (v & HPTE_V_ABSENT)
 		v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 	hpret[0] = v;
@@ -498,7 +527,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 	int global;
 	long int ret = H_SUCCESS;
 	struct revmap_entry *rev, *revs[4];
-	u64 hp0;
+	u64 hp0, hp1;
 
 	global = global_invalidates(kvm, 0);
 	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
@@ -531,6 +560,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 			}
 			found = 0;
 			hp0 = be64_to_cpu(hp[0]);
+			hp1 = be64_to_cpu(hp[1]);
+			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+				hp0 = hpte_new_to_old_v(hp0, hp1);
+				hp1 = hpte_new_to_old_r(hp1);
+			}
 			if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) {
 				switch (flags & 3) {
 				case 0:		/* absolute */
@@ -561,13 +595,14 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
 				args[j] |= rcbits << (56 - 5);
 				hp[0] = 0;
+				if (is_mmio_hpte(hp0, hp1))
+					atomic64_inc(&kvm->arch.mmio_update);
 				continue;
 			}
 
 			/* leave it locked */
 			hp[0] &= ~cpu_to_be64(HPTE_V_VALID);
-			tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]),
-				be64_to_cpu(hp[1]), pte_index);
+			tlbrb[n] = compute_tlbie_rb(hp0, hp1, pte_index);
 			indexes[n] = j;
 			hptes[n] = hp;
 			revs[n] = rev;
@@ -605,7 +640,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	__be64 *hpte;
 	struct revmap_entry *rev;
 	unsigned long v, r, rb, mask, bits;
-	u64 pte;
+	u64 pte_v, pte_r;
 
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
@@ -613,14 +648,16 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
 	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 		cpu_relax();
-	pte = be64_to_cpu(hpte[0]);
-	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
-	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
-		__unlock_hpte(hpte, pte);
+	v = pte_v = be64_to_cpu(hpte[0]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1]));
+	if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+	    ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) {
+		__unlock_hpte(hpte, pte_v);
 		return H_NOT_FOUND;
 	}
 
-	v = pte;
+	pte_r = be64_to_cpu(hpte[1]);
 	bits = (flags << 55) & HPTE_R_PP0;
 	bits |= (flags << 48) & HPTE_R_KEY_HI;
 	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
@@ -642,22 +679,26 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 		 * readonly to writable.  If it should be writable, we'll
 		 * take a trap and let the page fault code sort it out.
 		 */
-		pte = be64_to_cpu(hpte[1]);
-		r = (pte & ~mask) | bits;
-		if (hpte_is_writable(r) && !hpte_is_writable(pte))
+		r = (pte_r & ~mask) | bits;
+		if (hpte_is_writable(r) && !hpte_is_writable(pte_r))
 			r = hpte_make_readonly(r);
 		/* If the PTE is changing, invalidate it first */
-		if (r != pte) {
+		if (r != pte_r) {
 			rb = compute_tlbie_rb(v, r, pte_index);
-			hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) |
+			hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) |
 					      HPTE_V_ABSENT);
 			do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags),
 				  true);
+			/* Don't lose R/C bit updates done by hardware */
+			r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C);
 			hpte[1] = cpu_to_be64(r);
 		}
 	}
-	unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+	unlock_hpte(hpte, pte_v & ~HPTE_V_HVLOCK);
 	asm volatile("ptesync" : : : "memory");
+	if (is_mmio_hpte(v, pte_r))
+		atomic64_inc(&kvm->arch.mmio_update);
+
 	return H_SUCCESS;
 }
 
@@ -681,6 +722,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
 		v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
 		r = be64_to_cpu(hpte[1]);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			v = hpte_new_to_old_v(v, r);
+			r = hpte_new_to_old_r(r);
+		}
 		if (v & HPTE_V_ABSENT) {
 			v &= ~HPTE_V_ABSENT;
 			v |= HPTE_V_VALID;
@@ -798,10 +843,16 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
 			unsigned long pte_index)
 {
 	unsigned long rb;
+	u64 hp0, hp1;
 
 	hptep[0] &= ~cpu_to_be64(HPTE_V_VALID);
-	rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
-			      pte_index);
+	hp0 = be64_to_cpu(hptep[0]);
+	hp1 = be64_to_cpu(hptep[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hp0 = hpte_new_to_old_v(hp0, hp1);
+		hp1 = hpte_new_to_old_r(hp1);
+	}
+	rb = compute_tlbie_rb(hp0, hp1, pte_index);
 	do_tlbies(kvm, &rb, 1, 1, true);
 }
 EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
@@ -811,9 +862,15 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
 {
 	unsigned long rb;
 	unsigned char rbyte;
+	u64 hp0, hp1;
 
-	rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
-			      pte_index);
+	hp0 = be64_to_cpu(hptep[0]);
+	hp1 = be64_to_cpu(hptep[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hp0 = hpte_new_to_old_v(hp0, hp1);
+		hp1 = hpte_new_to_old_r(hp1);
+	}
+	rb = compute_tlbie_rb(hp0, hp1, pte_index);
 	rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8;
 	/* modify only the second-last byte, which contains the ref bit */
 	*((char *)hptep + 14) = rbyte;
@@ -828,6 +885,37 @@ static int slb_base_page_shift[4] = {
 	20,	/* 1M, unsupported */
 };
 
+static struct mmio_hpte_cache_entry *mmio_cache_search(struct kvm_vcpu *vcpu,
+		unsigned long eaddr, unsigned long slb_v, long mmio_update)
+{
+	struct mmio_hpte_cache_entry *entry = NULL;
+	unsigned int pshift;
+	unsigned int i;
+
+	for (i = 0; i < MMIO_HPTE_CACHE_SIZE; i++) {
+		entry = &vcpu->arch.mmio_cache.entry[i];
+		if (entry->mmio_update == mmio_update) {
+			pshift = entry->slb_base_pshift;
+			if ((entry->eaddr >> pshift) == (eaddr >> pshift) &&
+			    entry->slb_v == slb_v)
+				return entry;
+		}
+	}
+	return NULL;
+}
+
+static struct mmio_hpte_cache_entry *
+			next_mmio_cache_entry(struct kvm_vcpu *vcpu)
+{
+	unsigned int index = vcpu->arch.mmio_cache.index;
+
+	vcpu->arch.mmio_cache.index++;
+	if (vcpu->arch.mmio_cache.index == MMIO_HPTE_CACHE_SIZE)
+		vcpu->arch.mmio_cache.index = 0;
+
+	return &vcpu->arch.mmio_cache.entry[index];
+}
+
 /* When called from virtmode, this func should be protected by
  * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK
  * can trigger deadlock issue.
@@ -842,7 +930,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 	unsigned long avpn;
 	__be64 *hpte;
 	unsigned long mask, val;
-	unsigned long v, r;
+	unsigned long v, r, orig_v;
 
 	/* Get page shift, work out hash and AVPN etc. */
 	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
@@ -877,6 +965,8 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		for (i = 0; i < 16; i += 2) {
 			/* Read the PTE racily */
 			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				v = hpte_new_to_old_v(v, be64_to_cpu(hpte[i+1]));
 
 			/* Check valid/absent, hash, segment size and AVPN */
 			if (!(v & valid) || (v & mask) != val)
@@ -885,8 +975,12 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 			/* Lock the PTE and read it under the lock */
 			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
 				cpu_relax();
-			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
+			v = orig_v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
 			r = be64_to_cpu(hpte[i+1]);
+			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+				v = hpte_new_to_old_v(v, r);
+				r = hpte_new_to_old_r(r);
+			}
 
 			/*
 			 * Check the HPTE again, including base page size
@@ -896,7 +990,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 				/* Return with the HPTE still locked */
 				return (hash << 3) + (i >> 1);
 
-			__unlock_hpte(&hpte[i], v);
+			__unlock_hpte(&hpte[i], orig_v);
 		}
 
 		if (val & HPTE_V_SECONDARY)
@@ -924,30 +1018,45 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 {
 	struct kvm *kvm = vcpu->kvm;
 	long int index;
-	unsigned long v, r, gr;
+	unsigned long v, r, gr, orig_v;
 	__be64 *hpte;
 	unsigned long valid;
 	struct revmap_entry *rev;
 	unsigned long pp, key;
+	struct mmio_hpte_cache_entry *cache_entry = NULL;
+	long mmio_update = 0;
 
 	/* For protection fault, expect to find a valid HPTE */
 	valid = HPTE_V_VALID;
-	if (status & DSISR_NOHPTE)
+	if (status & DSISR_NOHPTE) {
 		valid |= HPTE_V_ABSENT;
-
-	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
-	if (index < 0) {
-		if (status & DSISR_NOHPTE)
-			return status;	/* there really was no HPTE */
-		return 0;		/* for prot fault, HPTE disappeared */
+		mmio_update = atomic64_read(&kvm->arch.mmio_update);
+		cache_entry = mmio_cache_search(vcpu, addr, slb_v, mmio_update);
 	}
-	hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
-	v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
-	r = be64_to_cpu(hpte[1]);
-	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
-	gr = rev->guest_rpte;
+	if (cache_entry) {
+		index = cache_entry->pte_index;
+		v = cache_entry->hpte_v;
+		r = cache_entry->hpte_r;
+		gr = cache_entry->rpte;
+	} else {
+		index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
+		if (index < 0) {
+			if (status & DSISR_NOHPTE)
+				return status;	/* there really was no HPTE */
+			return 0;	/* for prot fault, HPTE disappeared */
+		}
+		hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+		v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
+		r = be64_to_cpu(hpte[1]);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			v = hpte_new_to_old_v(v, r);
+			r = hpte_new_to_old_r(r);
+		}
+		rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
+		gr = rev->guest_rpte;
 
-	unlock_hpte(hpte, v);
+		unlock_hpte(hpte, orig_v);
+	}
 
 	/* For not found, if the HPTE is valid by now, retry the instruction */
 	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
@@ -985,12 +1094,32 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 	vcpu->arch.pgfault_index = index;
 	vcpu->arch.pgfault_hpte[0] = v;
 	vcpu->arch.pgfault_hpte[1] = r;
+	vcpu->arch.pgfault_cache = cache_entry;
 
 	/* Check the storage key to see if it is possibly emulated MMIO */
-	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
-	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
-	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
-		return -2;	/* MMIO emulation - load instr word */
+	if ((r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) {
+		if (!cache_entry) {
+			unsigned int pshift = 12;
+			unsigned int pshift_index;
+
+			if (slb_v & SLB_VSID_L) {
+				pshift_index = ((slb_v & SLB_VSID_LP) >> 4);
+				pshift = slb_base_page_shift[pshift_index];
+			}
+			cache_entry = next_mmio_cache_entry(vcpu);
+			cache_entry->eaddr = addr;
+			cache_entry->slb_base_pshift = pshift;
+			cache_entry->pte_index = index;
+			cache_entry->hpte_v = v;
+			cache_entry->hpte_r = r;
+			cache_entry->rpte = gr;
+			cache_entry->slb_v = slb_v;
+			cache_entry->mmio_update = mmio_update;
+		}
+		if (data && (vcpu->arch.shregs.msr & MSR_IR))
+			return -2;	/* MMIO emulation - load instr word */
+	}
 
 	return -1;		/* send fault up to host kernel mode */
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index a0ea63ac2b52..06edc4366639 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -70,7 +70,11 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
 	hcpu = hcore << threads_shift;
 	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
 	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
-	icp_native_cause_ipi_rm(hcpu);
+	if (paca[hcpu].kvm_hstate.xics_phys)
+		icp_native_cause_ipi_rm(hcpu);
+	else
+		opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu),
+				     IPI_PRIORITY);
 }
 #else
 static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
@@ -737,7 +741,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 
 unsigned long eoi_rc;
 
-static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 {
 	unsigned long xics_phys;
 	int64_t rc;
@@ -751,7 +755,12 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
 
 	/* EOI it */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
-	_stwcix(xics_phys + XICS_XIRR, xirr);
+	if (xics_phys) {
+		_stwcix(xics_phys + XICS_XIRR, xirr);
+	} else {
+		rc = opal_rm_int_eoi(be32_to_cpu(xirr));
+		*again = rc > 0;
+	}
 }
 
 static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
@@ -809,9 +818,10 @@ static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
 }
 
 long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
-				 u32 xirr,
+				 __be32 xirr,
 				 struct kvmppc_irq_map *irq_map,
-				 struct kvmppc_passthru_irqmap *pimap)
+				 struct kvmppc_passthru_irqmap *pimap,
+				 bool *again)
 {
 	struct kvmppc_xics *xics;
 	struct kvmppc_icp *icp;
@@ -825,7 +835,8 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
 	icp_rm_deliver_irq(xics, icp, irq);
 
 	/* EOI the interrupt */
-	icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+	icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
+		again);
 
 	if (check_too_hard(xics, icp) == H_TOO_HARD)
 		return 2;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c3c1d1bcfc67..9338a818e05c 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -501,17 +501,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	cmpwi	r0, 0
 	beq	57f
 	li	r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
-	mfspr	r4, SPRN_LPCR
-	rlwimi	r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
-	mtspr	SPRN_LPCR, r4
-	isync
-	std	r0, HSTATE_SCRATCH0(r13)
-	ptesync
-	ld	r0, HSTATE_SCRATCH0(r13)
-1:	cmpd	r0, r0
-	bne	1b
-	nap
-	b	.
+	mfspr	r5, SPRN_LPCR
+	rlwimi	r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
+	b	kvm_nap_sequence
 
 57:	li	r0, 0
 	stbx	r0, r3, r4
@@ -523,6 +515,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  *                                                                            *
  *****************************************************************************/
 
+/* Stack frame offsets */
+#define STACK_SLOT_TID		(112-16)
+#define STACK_SLOT_PSSCR	(112-24)
+
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
 
@@ -581,12 +577,14 @@ kvmppc_hv_entry:
 	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
 	cmpwi	r6,0
 	bne	10f
-	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
+BEGIN_FTR_SECTION
+	ld	r6,KVM_SDR1(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
 	mtspr	SPRN_LPID,r0
 	ptesync
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	mtspr	SPRN_LPID,r7
 	isync
 
@@ -607,12 +605,8 @@ kvmppc_hv_entry:
 	stdcx.	r7,0,r6
 	bne	23b
 	/* Flush the TLB of any entries for this LPID */
-	/* use arch 2.07S as a proxy for POWER8 */
-BEGIN_FTR_SECTION
-	li	r6,512			/* POWER8 has 512 sets */
-FTR_SECTION_ELSE
-	li	r6,128			/* POWER7 has 128 sets */
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
+	lwz	r6,KVM_TLB_SETS(r9)
+	li	r0,0			/* RS for P9 version of tlbiel */
 	mtctr	r6
 	li	r7,0x800		/* IS field = 0b10 */
 	ptesync
@@ -698,6 +692,14 @@ kvmppc_got_guest:
 	mtspr	SPRN_PURR,r7
 	mtspr	SPRN_SPURR,r8
 
+	/* Save host values of some registers */
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_TIDR
+	mfspr	r6, SPRN_PSSCR
+	std	r5, STACK_SLOT_TID(r1)
+	std	r6, STACK_SLOT_PSSCR(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
 BEGIN_FTR_SECTION
 	/* Set partition DABR */
 	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
@@ -750,14 +752,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
 BEGIN_FTR_SECTION
 	ld	r5, VCPU_MMCR + 24(r4)
 	ld	r6, VCPU_SIER(r4)
+	mtspr	SPRN_MMCR2, r5
+	mtspr	SPRN_SIER, r6
+BEGIN_FTR_SECTION_NESTED(96)
 	lwz	r7, VCPU_PMC + 24(r4)
 	lwz	r8, VCPU_PMC + 28(r4)
 	ld	r9, VCPU_MMCR + 32(r4)
-	mtspr	SPRN_MMCR2, r5
-	mtspr	SPRN_SIER, r6
 	mtspr	SPRN_SPMC1, r7
 	mtspr	SPRN_SPMC2, r8
 	mtspr	SPRN_MMCRS, r9
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_MMCR0, r3
 	isync
@@ -813,20 +817,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_EBBHR, r8
 	ld	r5, VCPU_EBBRR(r4)
 	ld	r6, VCPU_BESCR(r4)
-	ld	r7, VCPU_CSIGR(r4)
-	ld	r8, VCPU_TACR(r4)
+	lwz	r7, VCPU_GUEST_PID(r4)
+	ld	r8, VCPU_WORT(r4)
 	mtspr	SPRN_EBBRR, r5
 	mtspr	SPRN_BESCR, r6
-	mtspr	SPRN_CSIGR, r7
-	mtspr	SPRN_TACR, r8
+	mtspr	SPRN_PID, r7
+	mtspr	SPRN_WORT, r8
+BEGIN_FTR_SECTION
+	/* POWER8-only registers */
 	ld	r5, VCPU_TCSCR(r4)
 	ld	r6, VCPU_ACOP(r4)
-	lwz	r7, VCPU_GUEST_PID(r4)
-	ld	r8, VCPU_WORT(r4)
+	ld	r7, VCPU_CSIGR(r4)
+	ld	r8, VCPU_TACR(r4)
 	mtspr	SPRN_TCSCR, r5
 	mtspr	SPRN_ACOP, r6
-	mtspr	SPRN_PID, r7
-	mtspr	SPRN_WORT, r8
+	mtspr	SPRN_CSIGR, r7
+	mtspr	SPRN_TACR, r8
+FTR_SECTION_ELSE
+	/* POWER9-only registers */
+	ld	r5, VCPU_TID(r4)
+	ld	r6, VCPU_PSSCR(r4)
+	oris	r6, r6, PSSCR_EC@h	/* This makes stop trap to HV */
+	mtspr	SPRN_TIDR, r5
+	mtspr	SPRN_PSSCR, r6
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 8:
 
 	/*
@@ -1341,20 +1355,29 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	std	r8, VCPU_EBBHR(r9)
 	mfspr	r5, SPRN_EBBRR
 	mfspr	r6, SPRN_BESCR
-	mfspr	r7, SPRN_CSIGR
-	mfspr	r8, SPRN_TACR
+	mfspr	r7, SPRN_PID
+	mfspr	r8, SPRN_WORT
 	std	r5, VCPU_EBBRR(r9)
 	std	r6, VCPU_BESCR(r9)
-	std	r7, VCPU_CSIGR(r9)
-	std	r8, VCPU_TACR(r9)
+	stw	r7, VCPU_GUEST_PID(r9)
+	std	r8, VCPU_WORT(r9)
+BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_TCSCR
 	mfspr	r6, SPRN_ACOP
-	mfspr	r7, SPRN_PID
-	mfspr	r8, SPRN_WORT
+	mfspr	r7, SPRN_CSIGR
+	mfspr	r8, SPRN_TACR
 	std	r5, VCPU_TCSCR(r9)
 	std	r6, VCPU_ACOP(r9)
-	stw	r7, VCPU_GUEST_PID(r9)
-	std	r8, VCPU_WORT(r9)
+	std	r7, VCPU_CSIGR(r9)
+	std	r8, VCPU_TACR(r9)
+FTR_SECTION_ELSE
+	mfspr	r5, SPRN_TIDR
+	mfspr	r6, SPRN_PSSCR
+	std	r5, VCPU_TID(r9)
+	rldicl	r6, r6, 4, 50		/* r6 &= PSSCR_GUEST_VIS */
+	rotldi	r6, r6, 60
+	std	r6, VCPU_PSSCR(r9)
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	/*
 	 * Restore various registers to 0, where non-zero values
 	 * set by the guest could disrupt the host.
@@ -1363,12 +1386,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_IAMR, r0
 	mtspr	SPRN_CIABR, r0
 	mtspr	SPRN_DAWRX, r0
-	mtspr	SPRN_TCSCR, r0
 	mtspr	SPRN_WORT, r0
+BEGIN_FTR_SECTION
+	mtspr	SPRN_TCSCR, r0
 	/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
 	li	r0, 1
 	sldi	r0, r0, 31
 	mtspr	SPRN_MMCRS, r0
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 8:
 
 	/* Save and reset AMR and UAMOR before turning on the MMU */
@@ -1502,15 +1527,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	stw	r8, VCPU_PMC + 20(r9)
 BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_SIER
+	std	r5, VCPU_SIER(r9)
+BEGIN_FTR_SECTION_NESTED(96)
 	mfspr	r6, SPRN_SPMC1
 	mfspr	r7, SPRN_SPMC2
 	mfspr	r8, SPRN_MMCRS
-	std	r5, VCPU_SIER(r9)
 	stw	r6, VCPU_PMC + 24(r9)
 	stw	r7, VCPU_PMC + 28(r9)
 	std	r8, VCPU_MMCR + 32(r9)
 	lis	r4, 0x8000
 	mtspr	SPRN_MMCRS, r4
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 22:
 	/* Clear out SLB */
@@ -1519,6 +1546,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbia
 	ptesync
 
+	/* Restore host values of some registers */
+BEGIN_FTR_SECTION
+	ld	r5, STACK_SLOT_TID(r1)
+	ld	r6, STACK_SLOT_PSSCR(r1)
+	mtspr	SPRN_TIDR, r5
+	mtspr	SPRN_PSSCR, r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do
@@ -1552,12 +1587,14 @@ kvmhv_switch_to_host:
 	beq	19f
 
 	/* Primary thread switches back to host partition */
-	ld	r6,KVM_HOST_SDR1(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
+BEGIN_FTR_SECTION
+	ld	r6,KVM_HOST_SDR1(r4)
 	li	r8,LPID_RSVD		/* switch to reserved LPID */
 	mtspr	SPRN_LPID,r8
 	ptesync
-	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_SDR1,r6		/* switch to host page table */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	mtspr	SPRN_LPID,r7
 	isync
 
@@ -2211,6 +2248,21 @@ BEGIN_FTR_SECTION
 	ori	r5, r5, LPCR_PECEDH
 	rlwimi	r5, r3, 0, LPCR_PECEDP
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+kvm_nap_sequence:		/* desired LPCR value in r5 */
+BEGIN_FTR_SECTION
+	/*
+	 * PSSCR bits:	exit criterion = 1 (wakeup based on LPCR at sreset)
+	 *		enable state loss = 1 (allow SMT mode switch)
+	 *		requested level = 0 (just stop dispatching)
+	 */
+	lis	r3, (PSSCR_EC | PSSCR_ESL)@h
+	mtspr	SPRN_PSSCR, r3
+	/* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */
+	li	r4, LPCR_PECE_HVEE@higher
+	sldi	r4, r4, 32
+	or	r5, r5, r4
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	mtspr	SPRN_LPCR,r5
 	isync
 	li	r0, 0
@@ -2219,7 +2271,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	ld	r0, HSTATE_SCRATCH0(r13)
 1:	cmpd	r0, r0
 	bne	1b
+BEGIN_FTR_SECTION
 	nap
+FTR_SECTION_ELSE
+	PPC_STOP
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	b	.
 
 33:	mr	r4, r3
@@ -2600,11 +2656,13 @@ kvmppc_save_tm:
 	mfctr	r7
 	mfspr	r8, SPRN_AMR
 	mfspr	r10, SPRN_TAR
+	mfxer	r11
 	std	r5, VCPU_LR_TM(r9)
 	stw	r6, VCPU_CR_TM(r9)
 	std	r7, VCPU_CTR_TM(r9)
 	std	r8, VCPU_AMR_TM(r9)
 	std	r10, VCPU_TAR_TM(r9)
+	std	r11, VCPU_XER_TM(r9)
 
 	/* Restore r12 as trap number. */
 	lwz	r12, VCPU_TRAP(r9)
@@ -2697,11 +2755,13 @@ kvmppc_restore_tm:
 	ld	r7, VCPU_CTR_TM(r4)
 	ld	r8, VCPU_AMR_TM(r4)
 	ld	r9, VCPU_TAR_TM(r4)
+	ld	r10, VCPU_XER_TM(r4)
 	mtlr	r5
 	mtcr	r6
 	mtctr	r7
 	mtspr	SPRN_AMR, r8
 	mtspr	SPRN_TAR, r9
+	mtxer	r10
 
 	/*
 	 * Load up PPR and DSCR values but don't put them in the actual SPRs
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 70963c845e96..efd1183a6b16 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -536,7 +536,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
 	case KVM_CAP_SPAPR_TCE_64:
-	case KVM_CAP_PPC_ALLOC_HTAB:
 	case KVM_CAP_PPC_RTAS:
 	case KVM_CAP_PPC_FIXUP_HCALL:
 	case KVM_CAP_PPC_ENABLE_HCALL:
@@ -545,13 +544,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #endif
 		r = 1;
 		break;
+
+	case KVM_CAP_PPC_ALLOC_HTAB:
+		r = hv_enabled;
+		break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	case KVM_CAP_PPC_SMT:
-		if (hv_enabled)
-			r = threads_per_subcore;
-		else
-			r = 0;
+		r = 0;
+		if (hv_enabled) {
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				r = 1;
+			else
+				r = threads_per_subcore;
+		}
 		break;
 	case KVM_CAP_PPC_RMA:
 		r = 0;
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index fb21990c0fb4..ebc6dd449556 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -449,7 +449,7 @@ TRACE_EVENT(kvmppc_vcore_wakeup,
 		__entry->tgid   = current->tgid;
 	),
 
-	TP_printk("%s time %lld ns, tgid=%d",
+	TP_printk("%s time %llu ns, tgid=%d",
 		__entry->waited ? "wait" : "poll",
 		__entry->ns, __entry->tgid)
 );
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 83ddc0e171b0..ad9fd5245be2 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -221,13 +221,18 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 		return -1;
 
 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED)) {
 		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
 			i, hpte_v, hpte_r);
 	}
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
+		hpte_v = hpte_old_to_new_v(hpte_v);
+	}
+
 	hptep->r = cpu_to_be64(hpte_r);
 	/* Guarantee the second dword is visible before the valid bit */
 	eieio();
@@ -295,6 +300,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 		vpn, want_v & HPTE_V_AVPN, slot, newpp);
 
 	hpte_v = be64_to_cpu(hptep->v);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -309,6 +316,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 		native_lock_hpte(hptep);
 		/* recheck with locks held */
 		hpte_v = be64_to_cpu(hptep->v);
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 		if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
 			     !(hpte_v & HPTE_V_VALID))) {
 			ret = -1;
@@ -350,6 +359,8 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
 		hptep = htab_address + slot;
 		hpte_v = be64_to_cpu(hptep->v);
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
 		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
 			/* HPTE matches */
@@ -409,6 +420,8 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 	native_lock_hpte(hptep);
 	hpte_v = be64_to_cpu(hptep->v);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
@@ -467,6 +480,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
 		want_v = hpte_encode_avpn(vpn, psize, ssize);
 		native_lock_hpte(hptep);
 		hpte_v = be64_to_cpu(hptep->v);
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
 		/* Even if we miss, we need to invalidate the TLB */
 		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
@@ -504,6 +519,10 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 	/* Look at the 8 bit LP value */
 	unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
+		hpte_r = hpte_new_to_old_r(hpte_r);
+	}
 	if (!(hpte_v & HPTE_V_LARGE)) {
 		size   = MMU_PAGE_4K;
 		a_size = MMU_PAGE_4K;
@@ -512,11 +531,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 		a_size = hpte_page_sizes[lp] >> 4;
 	}
 	/* This works for all page sizes, and for 256M and 1T segments */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		*ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT;
-	else
-		*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
-
+	*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
 	shift = mmu_psize_defs[size].shift;
 
 	avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
@@ -639,6 +654,9 @@ static void native_flush_hash_range(unsigned long number, int local)
 			want_v = hpte_encode_avpn(vpn, psize, ssize);
 			native_lock_hpte(hptep);
 			hpte_v = be64_to_cpu(hptep->v);
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				hpte_v = hpte_new_to_old_v(hpte_v,
+						be64_to_cpu(hptep->r));
 			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
 			    !(hpte_v & HPTE_V_VALID))
 				native_unlock_hpte(hptep);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 78dabf065ba9..8410b4bb36ed 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -796,37 +796,17 @@ static void update_hid_for_hash(void)
 static void __init hash_init_partition_table(phys_addr_t hash_table,
 					     unsigned long htab_size)
 {
-	unsigned long ps_field;
-	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+	mmu_partition_table_init();
 
 	/*
-	 * slb llp encoding for the page size used in VPM real mode.
-	 * We can ignore that for lpid 0
+	 * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
+	 * For now, UPRT is 0 and we have no segment table.
 	 */
-	ps_field = 0;
 	htab_size =  __ilog2(htab_size) - 18;
-
-	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
-	partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
-						MEMBLOCK_ALLOC_ANYWHERE));
-
-	/* Initialize the Partition Table with no entries */
-	memset((void *)partition_tb, 0, patb_size);
-	partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size);
-	/*
-	 * FIXME!! This should be done via update_partition table
-	 * For now UPRT is 0 for us.
-	 */
-	partition_tb->patb1 = 0;
+	mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
 	pr_info("Partition table %p\n", partition_tb);
 	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
 		update_hid_for_hash();
-	/*
-	 * update partition table control register,
-	 * 64 K size.
-	 */
-	mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
-
 }
 
 static void __init htab_initialize(void)
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 688b54517655..8d941c692eb3 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -177,23 +177,15 @@ redo:
 
 static void __init radix_init_partition_table(void)
 {
-	unsigned long rts_field;
+	unsigned long rts_field, dw0;
 
+	mmu_partition_table_init();
 	rts_field = radix__get_tree_size();
+	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
+	mmu_partition_table_set_entry(0, dw0, 0);
 
-	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
-	partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT);
-	partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
-					  RADIX_PGD_INDEX_SIZE | PATB_HR);
 	pr_info("Initializing Radix MMU\n");
 	pr_info("Partition table %p\n", partition_tb);
-
-	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-	/*
-	 * update partition table control register,
-	 * 64 K size.
-	 */
-	mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
 }
 
 void __init radix_init_native(void)
@@ -378,6 +370,8 @@ void __init radix__early_init_mmu(void)
 		radix_init_partition_table();
 	}
 
+	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
 	radix_init_pgtable();
 }
 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index f5e8d4edb808..8bca7f58afc4 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -431,3 +431,37 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 	}
 }
 #endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void __init mmu_partition_table_init(void)
+{
+	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+
+	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+	partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+						MEMBLOCK_ALLOC_ANYWHERE));
+
+	/* Initialize the Partition Table with no entries */
+	memset((void *)partition_tb, 0, patb_size);
+
+	/*
+	 * update partition table control register,
+	 * 64 K size.
+	 */
+	mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+				   unsigned long dw1)
+{
+	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+	/* Global flush of TLBs and partition table caches for this lpid */
+	asm volatile("ptesync" : : : "memory");
+	asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+		     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 44d2d842cee7..3aa40f1b20f5 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -304,8 +304,11 @@ OPAL_CALL(opal_pci_get_presence_state,		OPAL_PCI_GET_PRESENCE_STATE);
 OPAL_CALL(opal_pci_get_power_state,		OPAL_PCI_GET_POWER_STATE);
 OPAL_CALL(opal_pci_set_power_state,		OPAL_PCI_SET_POWER_STATE);
 OPAL_CALL(opal_int_get_xirr,			OPAL_INT_GET_XIRR);
+OPAL_CALL_REAL(opal_rm_int_get_xirr,		OPAL_INT_GET_XIRR);
 OPAL_CALL(opal_int_set_cppr,			OPAL_INT_SET_CPPR);
 OPAL_CALL(opal_int_eoi,				OPAL_INT_EOI);
+OPAL_CALL_REAL(opal_rm_int_eoi,			OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
+OPAL_CALL_REAL(opal_rm_int_set_mfrr,		OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
 OPAL_CALL_REAL(opal_rm_pci_tce_kill,		OPAL_PCI_TCE_KILL);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 6c9a65b52e63..b3b8930ac52f 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -896,3 +896,5 @@ EXPORT_SYMBOL_GPL(opal_leds_get_ind);
 EXPORT_SYMBOL_GPL(opal_leds_set_ind);
 /* Export this symbol for PowerNV Operator Panel class driver */
 EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
+/* Export this for KVM */
+EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index cb3c50328de8..cc2b281a3766 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 	vflags &= ~HPTE_V_SECONDARY;
 
 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags;
+	hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
 
 	spin_lock_irqsave(&ps3_htab_lock, flags);
 
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index aa35245d8d6d..f2c98f6c1c9c 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -145,7 +145,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 			 hpte_group, vpn,  pa, rflags, vflags, psize);
 
 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
+	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
 	if (!(vflags & HPTE_V_BOLTED))
 		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 028f97be5bae..c6722112527d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -136,6 +136,7 @@ config S390
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_API_DEBUG
+	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -169,6 +170,7 @@ config S390
 	select OLD_SIGSUSPEND3
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
+	select THREAD_INFO_IN_TASK
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select ARCH_HAS_SCALED_CPUTIME
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
index 0daa070d6c9d..6bd2c9022be3 100644
--- a/arch/s390/boot/compressed/Makefile
+++ b/arch/s390/boot/compressed/Makefile
@@ -10,7 +10,7 @@ targets	:= vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
 targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
 targets += misc.o piggy.o sizes.h head.o
 
-KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUX_INCLUDE) -O2
+KBUILD_CFLAGS := -m64 -D__KERNEL__ -O2
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -msoft-float
 KBUILD_CFLAGS += $(call cc-option,-mpacked-stack)
diff --git a/arch/s390/boot/compressed/head.S b/arch/s390/boot/compressed/head.S
index 28c4f96a2d9c..11f6254c561e 100644
--- a/arch/s390/boot/compressed/head.S
+++ b/arch/s390/boot/compressed/head.S
@@ -46,7 +46,7 @@ mover_end:
 
 	.align	8
 .Lstack:
-	.quad	0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER))
+	.quad	0x8000 + (1<<(PAGE_SHIFT+THREAD_SIZE_ORDER))
 .Loffset:
 	.quad	0x11000
 .Lmvsize:
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index 45968686f918..e659daffe368 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -66,6 +66,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y
 CONFIG_CLEANCACHE=y
 CONFIG_FRONTSWAP=y
 CONFIG_CMA=y
+CONFIG_CMA_DEBUG=y
+CONFIG_CMA_DEBUGFS=y
 CONFIG_MEM_SOFT_DIRTY=y
 CONFIG_ZPOOL=m
 CONFIG_ZBUD=m
@@ -366,6 +368,8 @@ CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_NET_TCPPROBE=m
 CONFIG_DEVTMPFS=y
+CONFIG_DMA_CMA=y
+CONFIG_CMA_SIZE_MBYTES=0
 CONFIG_CONNECTOR=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_CRYPTOLOOP=m
@@ -438,7 +442,6 @@ CONFIG_TUN=m
 CONFIG_VETH=m
 CONFIG_VIRTIO_NET=m
 CONFIG_NLMON=m
-CONFIG_VHOST_NET=m
 # CONFIG_NET_VENDOR_ARC is not set
 # CONFIG_NET_VENDOR_CHELSIO is not set
 # CONFIG_NET_VENDOR_INTEL is not set
@@ -693,3 +696,4 @@ CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
 CONFIG_KVM_S390_UCONTROL=y
+CONFIG_VHOST_NET=m
diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig
index 1dd05e345c4d..95ceac50bc65 100644
--- a/arch/s390/configs/gcov_defconfig
+++ b/arch/s390/configs/gcov_defconfig
@@ -362,6 +362,8 @@ CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_NET_TCPPROBE=m
 CONFIG_DEVTMPFS=y
+CONFIG_DMA_CMA=y
+CONFIG_CMA_SIZE_MBYTES=0
 CONFIG_CONNECTOR=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_CRYPTOLOOP=m
@@ -434,7 +436,6 @@ CONFIG_TUN=m
 CONFIG_VETH=m
 CONFIG_VIRTIO_NET=m
 CONFIG_NLMON=m
-CONFIG_VHOST_NET=m
 # CONFIG_NET_VENDOR_ARC is not set
 # CONFIG_NET_VENDOR_CHELSIO is not set
 # CONFIG_NET_VENDOR_INTEL is not set
@@ -633,3 +634,4 @@ CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
 CONFIG_KVM_S390_UCONTROL=y
+CONFIG_VHOST_NET=m
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index 29d1178666f0..bc7b176f5795 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -362,6 +362,8 @@ CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_NET_TCPPROBE=m
 CONFIG_DEVTMPFS=y
+CONFIG_DMA_CMA=y
+CONFIG_CMA_SIZE_MBYTES=0
 CONFIG_CONNECTOR=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_CRYPTOLOOP=m
@@ -434,7 +436,6 @@ CONFIG_TUN=m
 CONFIG_VETH=m
 CONFIG_VIRTIO_NET=m
 CONFIG_NLMON=m
-CONFIG_VHOST_NET=m
 # CONFIG_NET_VENDOR_ARC is not set
 # CONFIG_NET_VENDOR_CHELSIO is not set
 # CONFIG_NET_VENDOR_INTEL is not set
@@ -632,3 +633,4 @@ CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
 CONFIG_KVM_S390_UCONTROL=y
+CONFIG_VHOST_NET=m
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index 9cc050f9536c..1113389d0a39 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -507,8 +507,10 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf,
 		prng_data->prngws.byte_counter += n;
 		prng_data->prngws.reseed_counter += n;
 
-		if (copy_to_user(ubuf, prng_data->buf, chunk))
-			return -EFAULT;
+		if (copy_to_user(ubuf, prng_data->buf, chunk)) {
+			ret = -EFAULT;
+			break;
+		}
 
 		nbytes -= chunk;
 		ret += chunk;
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 09bccb224d03..cf8a2d92467f 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -3,6 +3,7 @@
  *
  *    Copyright IBM Corp. 2006, 2008
  *    Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ *    License: GPL
  */
 
 #define KMSG_COMPONENT "hypfs"
@@ -18,7 +19,8 @@
 #include <linux/time.h>
 #include <linux/parser.h>
 #include <linux/sysfs.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/uio.h>
@@ -443,7 +445,6 @@ static struct file_system_type hypfs_type = {
 	.mount		= hypfs_mount,
 	.kill_sb	= hypfs_kill_super
 };
-MODULE_ALIAS_FS("s390_hypfs");
 
 static const struct super_operations hypfs_s_ops = {
 	.statfs		= simple_statfs,
@@ -497,21 +498,4 @@ fail_dbfs_exit:
 	pr_err("Initialization of hypfs failed with rc=%i\n", rc);
 	return rc;
 }
-
-static void __exit hypfs_exit(void)
-{
-	unregister_filesystem(&hypfs_type);
-	sysfs_remove_mount_point(hypervisor_kobj, "s390");
-	hypfs_diag0c_exit();
-	hypfs_sprp_exit();
-	hypfs_vm_exit();
-	hypfs_diag_exit();
-	hypfs_dbfs_exit();
-}
-
-module_init(hypfs_init)
-module_exit(hypfs_exit)
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Michael Holzheu <holzheu@de.ibm.com>");
-MODULE_DESCRIPTION("s390 Hypervisor Filesystem");
+device_initcall(hypfs_init)
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 20f196b82a6e..8aea32fe8bd2 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -1,6 +1,6 @@
-
-
+generic-y += asm-offsets.h
 generic-y += clkdev.h
+generic-y += dma-contiguous.h
 generic-y += export.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
diff --git a/arch/s390/include/asm/asm-offsets.h b/arch/s390/include/asm/asm-offsets.h
deleted file mode 100644
index d370ee36a182..000000000000
--- a/arch/s390/include/asm/asm-offsets.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <generated/asm-offsets.h>
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index d28cc2f5b7b2..f7f69dfd2db2 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -1,13 +1,8 @@
 /*
- * Copyright IBM Corp. 1999, 2009
+ * Copyright IBM Corp. 1999, 2016
  * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>,
  *	      Denis Joseph Barrow,
- *	      Arnd Bergmann <arndb@de.ibm.com>,
- *
- * Atomic operations that C can't guarantee us.
- * Useful for resource counting etc.
- * s390 uses 'Compare And Swap' for atomicity in SMP environment.
- *
+ *	      Arnd Bergmann,
  */
 
 #ifndef __ARCH_S390_ATOMIC__
@@ -15,62 +10,12 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <asm/atomic_ops.h>
 #include <asm/barrier.h>
 #include <asm/cmpxchg.h>
 
 #define ATOMIC_INIT(i)  { (i) }
 
-#define __ATOMIC_NO_BARRIER	"\n"
-
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
-
-#define __ATOMIC_OR	"lao"
-#define __ATOMIC_AND	"lan"
-#define __ATOMIC_ADD	"laa"
-#define __ATOMIC_XOR	"lax"
-#define __ATOMIC_BARRIER "bcr	14,0\n"
-
-#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier)		\
-({									\
-	int old_val;							\
-									\
-	typecheck(atomic_t *, ptr);					\
-	asm volatile(							\
-		op_string "	%0,%2,%1\n"				\
-		__barrier						\
-		: "=d" (old_val), "+Q" ((ptr)->counter)			\
-		: "d" (op_val)						\
-		: "cc", "memory");					\
-	old_val;							\
-})
-
-#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-
-#define __ATOMIC_OR	"or"
-#define __ATOMIC_AND	"nr"
-#define __ATOMIC_ADD	"ar"
-#define __ATOMIC_XOR	"xr"
-#define __ATOMIC_BARRIER "\n"
-
-#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier)		\
-({									\
-	int old_val, new_val;						\
-									\
-	typecheck(atomic_t *, ptr);					\
-	asm volatile(							\
-		"	l	%0,%2\n"				\
-		"0:	lr	%1,%0\n"				\
-		op_string "	%1,%3\n"				\
-		"	cs	%0,%1,%2\n"				\
-		"	jl	0b"					\
-		: "=&d" (old_val), "=&d" (new_val), "+Q" ((ptr)->counter)\
-		: "d" (op_val)						\
-		: "cc", "memory");					\
-	old_val;							\
-})
-
-#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-
 static inline int atomic_read(const atomic_t *v)
 {
 	int c;
@@ -90,27 +35,23 @@ static inline void atomic_set(atomic_t *v, int i)
 
 static inline int atomic_add_return(int i, atomic_t *v)
 {
-	return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER) + i;
+	return __atomic_add_barrier(i, &v->counter) + i;
 }
 
 static inline int atomic_fetch_add(int i, atomic_t *v)
 {
-	return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER);
+	return __atomic_add_barrier(i, &v->counter);
 }
 
 static inline void atomic_add(int i, atomic_t *v)
 {
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 	if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
-		asm volatile(
-			"asi	%0,%1\n"
-			: "+Q" (v->counter)
-			: "i" (i)
-			: "cc", "memory");
+		__atomic_add_const(i, &v->counter);
 		return;
 	}
 #endif
-	__ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_NO_BARRIER);
+	__atomic_add(i, &v->counter);
 }
 
 #define atomic_add_negative(_i, _v)	(atomic_add_return(_i, _v) < 0)
@@ -125,19 +66,19 @@ static inline void atomic_add(int i, atomic_t *v)
 #define atomic_dec_return(_v)		atomic_sub_return(1, _v)
 #define atomic_dec_and_test(_v)		(atomic_sub_return(1, _v) == 0)
 
-#define ATOMIC_OPS(op, OP)						\
+#define ATOMIC_OPS(op)							\
 static inline void atomic_##op(int i, atomic_t *v)			\
 {									\
-	__ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_NO_BARRIER);	\
+	__atomic_##op(i, &v->counter);					\
 }									\
 static inline int atomic_fetch_##op(int i, atomic_t *v)			\
 {									\
-	return __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_BARRIER);	\
+	return __atomic_##op##_barrier(i, &v->counter);			\
 }
 
-ATOMIC_OPS(and, AND)
-ATOMIC_OPS(or, OR)
-ATOMIC_OPS(xor, XOR)
+ATOMIC_OPS(and)
+ATOMIC_OPS(or)
+ATOMIC_OPS(xor)
 
 #undef ATOMIC_OPS
 
@@ -145,12 +86,7 @@ ATOMIC_OPS(xor, XOR)
 
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
-	asm volatile(
-		"	cs	%0,%2,%1"
-		: "+d" (old), "+Q" (v->counter)
-		: "d" (new)
-		: "cc", "memory");
-	return old;
+	return __atomic_cmpxchg(&v->counter, old, new);
 }
 
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
@@ -168,65 +104,11 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 	return c;
 }
 
-
-#undef __ATOMIC_LOOP
-
 #define ATOMIC64_INIT(i)  { (i) }
 
-#define __ATOMIC64_NO_BARRIER	"\n"
-
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
-
-#define __ATOMIC64_OR	"laog"
-#define __ATOMIC64_AND	"lang"
-#define __ATOMIC64_ADD	"laag"
-#define __ATOMIC64_XOR	"laxg"
-#define __ATOMIC64_BARRIER "bcr	14,0\n"
-
-#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier)		\
-({									\
-	long long old_val;						\
-									\
-	typecheck(atomic64_t *, ptr);					\
-	asm volatile(							\
-		op_string "	%0,%2,%1\n"				\
-		__barrier						\
-		: "=d" (old_val), "+Q" ((ptr)->counter)			\
-		: "d" (op_val)						\
-		: "cc", "memory");					\
-	old_val;							\
-})
-
-#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-
-#define __ATOMIC64_OR	"ogr"
-#define __ATOMIC64_AND	"ngr"
-#define __ATOMIC64_ADD	"agr"
-#define __ATOMIC64_XOR	"xgr"
-#define __ATOMIC64_BARRIER "\n"
-
-#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier)		\
-({									\
-	long long old_val, new_val;					\
-									\
-	typecheck(atomic64_t *, ptr);					\
-	asm volatile(							\
-		"	lg	%0,%2\n"				\
-		"0:	lgr	%1,%0\n"				\
-		op_string "	%1,%3\n"				\
-		"	csg	%0,%1,%2\n"				\
-		"	jl	0b"					\
-		: "=&d" (old_val), "=&d" (new_val), "+Q" ((ptr)->counter)\
-		: "d" (op_val)						\
-		: "cc", "memory");					\
-	old_val;							\
-})
-
-#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-
-static inline long long atomic64_read(const atomic64_t *v)
+static inline long atomic64_read(const atomic64_t *v)
 {
-	long long c;
+	long c;
 
 	asm volatile(
 		"	lg	%0,%1\n"
@@ -234,71 +116,60 @@ static inline long long atomic64_read(const atomic64_t *v)
 	return c;
 }
 
-static inline void atomic64_set(atomic64_t *v, long long i)
+static inline void atomic64_set(atomic64_t *v, long i)
 {
 	asm volatile(
 		"	stg	%1,%0\n"
 		: "=Q" (v->counter) : "d" (i));
 }
 
-static inline long long atomic64_add_return(long long i, atomic64_t *v)
+static inline long atomic64_add_return(long i, atomic64_t *v)
 {
-	return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER) + i;
+	return __atomic64_add_barrier(i, &v->counter) + i;
 }
 
-static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
+static inline long atomic64_fetch_add(long i, atomic64_t *v)
 {
-	return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER);
+	return __atomic64_add_barrier(i, &v->counter);
 }
 
-static inline void atomic64_add(long long i, atomic64_t *v)
+static inline void atomic64_add(long i, atomic64_t *v)
 {
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 	if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
-		asm volatile(
-			"agsi	%0,%1\n"
-			: "+Q" (v->counter)
-			: "i" (i)
-			: "cc", "memory");
+		__atomic64_add_const(i, &v->counter);
 		return;
 	}
 #endif
-	__ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_NO_BARRIER);
+	__atomic64_add(i, &v->counter);
 }
 
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
 
-static inline long long atomic64_cmpxchg(atomic64_t *v,
-					     long long old, long long new)
+static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
 {
-	asm volatile(
-		"	csg	%0,%2,%1"
-		: "+d" (old), "+Q" (v->counter)
-		: "d" (new)
-		: "cc", "memory");
-	return old;
+	return __atomic64_cmpxchg(&v->counter, old, new);
 }
 
-#define ATOMIC64_OPS(op, OP)						\
+#define ATOMIC64_OPS(op)						\
 static inline void atomic64_##op(long i, atomic64_t *v)			\
 {									\
-	__ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_NO_BARRIER);	\
+	__atomic64_##op(i, &v->counter);				\
 }									\
 static inline long atomic64_fetch_##op(long i, atomic64_t *v)		\
 {									\
-	return __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_BARRIER); \
+	return __atomic64_##op##_barrier(i, &v->counter);		\
 }
 
-ATOMIC64_OPS(and, AND)
-ATOMIC64_OPS(or, OR)
-ATOMIC64_OPS(xor, XOR)
+ATOMIC64_OPS(and)
+ATOMIC64_OPS(or)
+ATOMIC64_OPS(xor)
 
 #undef ATOMIC64_OPS
-#undef __ATOMIC64_LOOP
 
-static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u)
+static inline int atomic64_add_unless(atomic64_t *v, long i, long u)
 {
-	long long c, old;
+	long c, old;
 
 	c = atomic64_read(v);
 	for (;;) {
@@ -312,9 +183,9 @@ static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u)
 	return c != u;
 }
 
-static inline long long atomic64_dec_if_positive(atomic64_t *v)
+static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
-	long long c, old, dec;
+	long c, old, dec;
 
 	c = atomic64_read(v);
 	for (;;) {
@@ -333,9 +204,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 #define atomic64_inc(_v)		atomic64_add(1, _v)
 #define atomic64_inc_return(_v)		atomic64_add_return(1, _v)
 #define atomic64_inc_and_test(_v)	(atomic64_add_return(1, _v) == 0)
-#define atomic64_sub_return(_i, _v)	atomic64_add_return(-(long long)(_i), _v)
-#define atomic64_fetch_sub(_i, _v)	atomic64_fetch_add(-(long long)(_i), _v)
-#define atomic64_sub(_i, _v)		atomic64_add(-(long long)(_i), _v)
+#define atomic64_sub_return(_i, _v)	atomic64_add_return(-(long)(_i), _v)
+#define atomic64_fetch_sub(_i, _v)	atomic64_fetch_add(-(long)(_i), _v)
+#define atomic64_sub(_i, _v)		atomic64_add(-(long)(_i), _v)
 #define atomic64_sub_and_test(_i, _v)	(atomic64_sub_return(_i, _v) == 0)
 #define atomic64_dec(_v)		atomic64_sub(1, _v)
 #define atomic64_dec_return(_v)		atomic64_sub_return(1, _v)
diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h
new file mode 100644
index 000000000000..ac9e2b939d04
--- /dev/null
+++ b/arch/s390/include/asm/atomic_ops.h
@@ -0,0 +1,130 @@
+/*
+ * Low level function for atomic operations
+ *
+ * Copyright IBM Corp. 1999, 2016
+ */
+
+#ifndef __ARCH_S390_ATOMIC_OPS__
+#define __ARCH_S390_ATOMIC_OPS__
+
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+
+#define __ATOMIC_OP(op_name, op_type, op_string, op_barrier)		\
+static inline op_type op_name(op_type val, op_type *ptr)		\
+{									\
+	op_type old;							\
+									\
+	asm volatile(							\
+		op_string "	%[old],%[val],%[ptr]\n"			\
+		op_barrier						\
+		: [old] "=d" (old), [ptr] "+Q" (*ptr)			\
+		: [val] "d" (val) : "cc", "memory");			\
+	return old;							\
+}									\
+
+#define __ATOMIC_OPS(op_name, op_type, op_string)			\
+	__ATOMIC_OP(op_name, op_type, op_string, "\n")			\
+	__ATOMIC_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n")
+
+__ATOMIC_OPS(__atomic_add, int, "laa")
+__ATOMIC_OPS(__atomic_and, int, "lan")
+__ATOMIC_OPS(__atomic_or,  int, "lao")
+__ATOMIC_OPS(__atomic_xor, int, "lax")
+
+__ATOMIC_OPS(__atomic64_add, long, "laag")
+__ATOMIC_OPS(__atomic64_and, long, "lang")
+__ATOMIC_OPS(__atomic64_or,  long, "laog")
+__ATOMIC_OPS(__atomic64_xor, long, "laxg")
+
+#undef __ATOMIC_OPS
+#undef __ATOMIC_OP
+
+static inline void __atomic_add_const(int val, int *ptr)
+{
+	asm volatile(
+		"	asi	%[ptr],%[val]\n"
+		: [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc");
+}
+
+static inline void __atomic64_add_const(long val, long *ptr)
+{
+	asm volatile(
+		"	agsi	%[ptr],%[val]\n"
+		: [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc");
+}
+
+#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+
+#define __ATOMIC_OP(op_name, op_string)					\
+static inline int op_name(int val, int *ptr)				\
+{									\
+	int old, new;							\
+									\
+	asm volatile(							\
+		"0:	lr	%[new],%[old]\n"			\
+		op_string "	%[new],%[val]\n"			\
+		"	cs	%[old],%[new],%[ptr]\n"			\
+		"	jl	0b"					\
+		: [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\
+		: [val] "d" (val), "0" (*ptr) : "cc", "memory");	\
+	return old;							\
+}
+
+#define __ATOMIC_OPS(op_name, op_string)				\
+	__ATOMIC_OP(op_name, op_string)					\
+	__ATOMIC_OP(op_name##_barrier, op_string)
+
+__ATOMIC_OPS(__atomic_add, "ar")
+__ATOMIC_OPS(__atomic_and, "nr")
+__ATOMIC_OPS(__atomic_or,  "or")
+__ATOMIC_OPS(__atomic_xor, "xr")
+
+#undef __ATOMIC_OPS
+
+#define __ATOMIC64_OP(op_name, op_string)				\
+static inline long op_name(long val, long *ptr)				\
+{									\
+	long old, new;							\
+									\
+	asm volatile(							\
+		"0:	lgr	%[new],%[old]\n"			\
+		op_string "	%[new],%[val]\n"			\
+		"	csg	%[old],%[new],%[ptr]\n"			\
+		"	jl	0b"					\
+		: [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\
+		: [val] "d" (val), "0" (*ptr) : "cc", "memory");	\
+	return old;							\
+}
+
+#define __ATOMIC64_OPS(op_name, op_string)				\
+	__ATOMIC64_OP(op_name, op_string)				\
+	__ATOMIC64_OP(op_name##_barrier, op_string)
+
+__ATOMIC64_OPS(__atomic64_add, "agr")
+__ATOMIC64_OPS(__atomic64_and, "ngr")
+__ATOMIC64_OPS(__atomic64_or,  "ogr")
+__ATOMIC64_OPS(__atomic64_xor, "xgr")
+
+#undef __ATOMIC64_OPS
+
+#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+
+static inline int __atomic_cmpxchg(int *ptr, int old, int new)
+{
+	asm volatile(
+		"	cs	%[old],%[new],%[ptr]"
+		: [old] "+d" (old), [ptr] "+Q" (*ptr)
+		: [new] "d" (new) : "cc", "memory");
+	return old;
+}
+
+static inline long __atomic64_cmpxchg(long *ptr, long old, long new)
+{
+	asm volatile(
+		"	csg	%[old],%[new],%[ptr]"
+		: [old] "+d" (old), [ptr] "+Q" (*ptr)
+		: [new] "d" (new) : "cc", "memory");
+	return old;
+}
+
+#endif /* __ARCH_S390_ATOMIC_OPS__  */
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 8043f10da6b5..d92047da5ccb 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -42,57 +42,9 @@
 
 #include <linux/typecheck.h>
 #include <linux/compiler.h>
+#include <asm/atomic_ops.h>
 #include <asm/barrier.h>
 
-#define __BITOPS_NO_BARRIER	"\n"
-
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
-
-#define __BITOPS_OR		"laog"
-#define __BITOPS_AND		"lang"
-#define __BITOPS_XOR		"laxg"
-#define __BITOPS_BARRIER	"bcr	14,0\n"
-
-#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier)	\
-({								\
-	unsigned long __old;					\
-								\
-	typecheck(unsigned long *, (__addr));			\
-	asm volatile(						\
-		__op_string "	%0,%2,%1\n"			\
-		__barrier					\
-		: "=d" (__old),	"+Q" (*(__addr))		\
-		: "d" (__val)					\
-		: "cc", "memory");				\
-	__old;							\
-})
-
-#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-
-#define __BITOPS_OR		"ogr"
-#define __BITOPS_AND		"ngr"
-#define __BITOPS_XOR		"xgr"
-#define __BITOPS_BARRIER	"\n"
-
-#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier)	\
-({								\
-	unsigned long __old, __new;				\
-								\
-	typecheck(unsigned long *, (__addr));			\
-	asm volatile(						\
-		"	lg	%0,%2\n"			\
-		"0:	lgr	%1,%0\n"			\
-		__op_string "	%1,%3\n"			\
-		"	csg	%0,%1,%2\n"			\
-		"	jl	0b"				\
-		: "=&d" (__old), "=&d" (__new), "+Q" (*(__addr))\
-		: "d" (__val)					\
-		: "cc", "memory");				\
-	__old;							\
-})
-
-#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
-
 #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
 static inline unsigned long *
@@ -128,7 +80,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr)
 	}
 #endif
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
-	__BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_NO_BARRIER);
+	__atomic64_or(mask, addr);
 }
 
 static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr)
@@ -149,7 +101,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr)
 	}
 #endif
 	mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
-	__BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_NO_BARRIER);
+	__atomic64_and(mask, addr);
 }
 
 static inline void change_bit(unsigned long nr, volatile unsigned long *ptr)
@@ -170,7 +122,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr)
 	}
 #endif
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
-	__BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_NO_BARRIER);
+	__atomic64_xor(mask, addr);
 }
 
 static inline int
@@ -180,7 +132,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr)
 	unsigned long old, mask;
 
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
-	old = __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_BARRIER);
+	old = __atomic64_or_barrier(mask, addr);
 	return (old & mask) != 0;
 }
 
@@ -191,7 +143,7 @@ test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr)
 	unsigned long old, mask;
 
 	mask = ~(1UL << (nr & (BITS_PER_LONG - 1)));
-	old = __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_BARRIER);
+	old = __atomic64_and_barrier(mask, addr);
 	return (old & ~mask) != 0;
 }
 
@@ -202,7 +154,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr)
 	unsigned long old, mask;
 
 	mask = 1UL << (nr & (BITS_PER_LONG - 1));
-	old = __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_BARRIER);
+	old = __atomic64_xor_barrier(mask, addr);
 	return (old & mask) != 0;
 }
 
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 03516476127b..b69d8bc231a5 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -104,7 +104,8 @@ struct hws_basic_entry {
 	unsigned int P:1;	    /* 28 PSW Problem state		 */
 	unsigned int AS:2;	    /* 29-30 PSW address-space control	 */
 	unsigned int I:1;	    /* 31 entry valid or invalid	 */
-	unsigned int:16;
+	unsigned int CL:2;	    /* 32-33 Configuration Level	 */
+	unsigned int:14;
 	unsigned int prim_asn:16;   /* primary ASN			 */
 	unsigned long long ia;	    /* Instruction Address		 */
 	unsigned long long gpp;     /* Guest Program Parameter		 */
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 1736c7d3c94c..f4381e1fb19e 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -193,7 +193,7 @@ extern char elf_platform[];
 do {								\
 	set_personality(PER_LINUX |				\
 		(current->personality & (~PER_MASK)));		\
-	current_thread_info()->sys_call_table = 		\
+	current->thread.sys_call_table =			\
 		(unsigned long) &sys_call_table;		\
 } while (0)
 #else /* CONFIG_COMPAT */
@@ -204,11 +204,11 @@ do {								\
 			(current->personality & ~PER_MASK));	\
 	if ((ex).e_ident[EI_CLASS] == ELFCLASS32) {		\
 		set_thread_flag(TIF_31BIT);			\
-		current_thread_info()->sys_call_table =		\
+		current->thread.sys_call_table =		\
 			(unsigned long)	&sys_call_table_emu;	\
 	} else {						\
 		clear_thread_flag(TIF_31BIT);			\
-		current_thread_info()->sys_call_table =		\
+		current->thread.sys_call_table =		\
 			(unsigned long) &sys_call_table;	\
 	}							\
 } while (0)
diff --git a/arch/s390/include/asm/facilities_src.h b/arch/s390/include/asm/facilities_src.h
deleted file mode 100644
index 3b758f66e48b..000000000000
--- a/arch/s390/include/asm/facilities_src.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *    Copyright IBM Corp. 2015
- */
-
-#ifndef S390_GEN_FACILITIES_C
-#error "This file can only be included by gen_facilities.c"
-#endif
-
-#include <linux/kconfig.h>
-
-struct facility_def {
-	char *name;
-	int *bits;
-};
-
-static struct facility_def facility_defs[] = {
-	{
-		/*
-		 * FACILITIES_ALS contains the list of facilities that are
-		 * required to run a kernel that is compiled e.g. with
-		 * -march=<machine>.
-		 */
-		.name = "FACILITIES_ALS",
-		.bits = (int[]){
-#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES
-			0,  /* N3 instructions */
-			1,  /* z/Arch mode installed */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
-			18, /* long displacement facility */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
-			7,  /* stfle */
-			17, /* message security assist */
-			21, /* extended-immediate facility */
-			25, /* store clock fast */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
-			27, /* mvcos */
-			32, /* compare and swap and store */
-			33, /* compare and swap and store 2 */
-			34, /* general extension facility */
-			35, /* execute extensions */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
-			45, /* fast-BCR, etc. */
-#endif
-#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
-			49, /* misc-instruction-extensions */
-			52, /* interlocked facility 2 */
-#endif
-#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES
-			53, /* load-and-zero-rightmost-byte, etc. */
-#endif
-			-1 /* END */
-		}
-	},
-	{
-		.name = "FACILITIES_KVM",
-		.bits = (int[]){
-			0,  /* N3 instructions */
-			1,  /* z/Arch mode installed */
-			2,  /* z/Arch mode active */
-			3,  /* DAT-enhancement */
-			4,  /* idte segment table */
-			5,  /* idte region table */
-			6,  /* ASN-and-LX reuse */
-			7,  /* stfle */
-			8,  /* enhanced-DAT 1 */
-			9,  /* sense-running-status */
-			10, /* conditional sske */
-			13, /* ipte-range */
-			14, /* nonquiescing key-setting */
-			73, /* transactional execution */
-			75, /* access-exception-fetch/store indication */
-			76, /* msa extension 3 */
-			77, /* msa extension 4 */
-			78, /* enhanced-DAT 2 */
-			-1  /* END */
-		}
-	},
-};
diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h
index 4da22b2f0521..edb5161df7e2 100644
--- a/arch/s390/include/asm/ipl.h
+++ b/arch/s390/include/asm/ipl.h
@@ -97,7 +97,7 @@ void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs);
 extern void do_reipl(void);
 extern void do_halt(void);
 extern void do_poff(void);
-extern void ipl_save_parameters(void);
+extern void ipl_verify_parameters(void);
 extern void ipl_update_parameters(void);
 extern size_t append_ipl_vmparm(char *, size_t);
 extern size_t append_ipl_scpdata(char *, size_t);
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 7b93b78f423c..9bfad2ad6312 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -95,7 +95,7 @@ struct lowcore {
 
 	/* Current process. */
 	__u64	current_task;			/* 0x0310 */
-	__u64	thread_info;			/* 0x0318 */
+	__u8	pad_0x318[0x320-0x318];		/* 0x0318 */
 	__u64	kernel_stack;			/* 0x0320 */
 
 	/* Interrupt, panic and restart stack. */
@@ -126,7 +126,8 @@ struct lowcore {
 	__u64	percpu_offset;			/* 0x0378 */
 	__u64	vdso_per_cpu_data;		/* 0x0380 */
 	__u64	machine_flags;			/* 0x0388 */
-	__u8	pad_0x0390[0x0398-0x0390];	/* 0x0390 */
+	__u32	preempt_count;			/* 0x0390 */
+	__u8	pad_0x0394[0x0398-0x0394];	/* 0x0394 */
 	__u64	gmap;				/* 0x0398 */
 	__u32	spinlock_lockval;		/* 0x03a0 */
 	__u32	fpu_flags;			/* 0x03a4 */
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index e75c64cbcf08..c232ef9711f5 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -46,6 +46,8 @@ struct clp_fh_list_entry {
 #define CLP_UTIL_STR_LEN	64
 #define CLP_PFIP_NR_SEGMENTS	4
 
+extern bool zpci_unique_uid;
+
 /* List PCI functions request */
 struct clp_req_list_pci {
 	struct clp_req_hdr hdr;
@@ -59,7 +61,8 @@ struct clp_rsp_list_pci {
 	u64 resume_token;
 	u32 reserved2;
 	u16 max_fn;
-	u8 reserved3;
+	u8			: 7;
+	u8 uid_checking		: 1;
 	u8 entry_size;
 	struct clp_fh_list_entry fh_list[CLP_FH_LIST_NR_ENTRIES];
 } __packed;
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index f4eb9843eed4..166f703dad7c 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -27,17 +27,17 @@ extern int page_table_allocate_pgste;
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
 {
-	typedef struct { char _[n]; } addrtype;
-
-	*s = val;
-	n = (n / 256) - 1;
-	asm volatile(
-		"	mvc	8(248,%0),0(%0)\n"
-		"0:	mvc	256(256,%0),0(%0)\n"
-		"	la	%0,256(%0)\n"
-		"	brct	%1,0b\n"
-		: "+a" (s), "+d" (n), "=m" (*(addrtype *) s)
-		: "m" (*(addrtype *) s));
+	struct addrtype { char _[256]; };
+	int i;
+
+	for (i = 0; i < n; i += 256) {
+		*s = val;
+		asm volatile(
+			"mvc	8(248,%[s]),0(%[s])\n"
+			: "+m" (*(struct addrtype *) s)
+			: [s] "a" (s));
+		s += 256 / sizeof(long);
+	}
 }
 
 static inline void crst_table_init(unsigned long *crst, unsigned long entry)
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
new file mode 100644
index 000000000000..b0776b2c8dcf
--- /dev/null
+++ b/arch/s390/include/asm/preempt.h
@@ -0,0 +1,137 @@
+#ifndef __ASM_PREEMPT_H
+#define __ASM_PREEMPT_H
+
+#include <asm/current.h>
+#include <linux/thread_info.h>
+#include <asm/atomic_ops.h>
+
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+
+#define PREEMPT_ENABLED	(0 + PREEMPT_NEED_RESCHED)
+
+static inline int preempt_count(void)
+{
+	return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED;
+}
+
+static inline void preempt_count_set(int pc)
+{
+	int old, new;
+
+	do {
+		old = READ_ONCE(S390_lowcore.preempt_count);
+		new = (old & PREEMPT_NEED_RESCHED) |
+			(pc & ~PREEMPT_NEED_RESCHED);
+	} while (__atomic_cmpxchg(&S390_lowcore.preempt_count,
+				  old, new) != old);
+}
+
+#define init_task_preempt_count(p)	do { } while (0)
+
+#define init_idle_preempt_count(p, cpu)	do { \
+	S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+} while (0)
+
+static inline void set_preempt_need_resched(void)
+{
+	__atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
+}
+
+static inline void clear_preempt_need_resched(void)
+{
+	__atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count);
+}
+
+static inline bool test_preempt_need_resched(void)
+{
+	return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED);
+}
+
+static inline void __preempt_count_add(int val)
+{
+	if (__builtin_constant_p(val) && (val >= -128) && (val <= 127))
+		__atomic_add_const(val, &S390_lowcore.preempt_count);
+	else
+		__atomic_add(val, &S390_lowcore.preempt_count);
+}
+
+static inline void __preempt_count_sub(int val)
+{
+	__preempt_count_add(-val);
+}
+
+static inline bool __preempt_count_dec_and_test(void)
+{
+	return __atomic_add(-1, &S390_lowcore.preempt_count) == 1;
+}
+
+static inline bool should_resched(int preempt_offset)
+{
+	return unlikely(READ_ONCE(S390_lowcore.preempt_count) ==
+			preempt_offset);
+}
+
+#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+
+#define PREEMPT_ENABLED	(0)
+
+static inline int preempt_count(void)
+{
+	return READ_ONCE(S390_lowcore.preempt_count);
+}
+
+static inline void preempt_count_set(int pc)
+{
+	S390_lowcore.preempt_count = pc;
+}
+
+#define init_task_preempt_count(p)	do { } while (0)
+
+#define init_idle_preempt_count(p, cpu)	do { \
+	S390_lowcore.preempt_count = PREEMPT_ENABLED; \
+} while (0)
+
+static inline void set_preempt_need_resched(void)
+{
+}
+
+static inline void clear_preempt_need_resched(void)
+{
+}
+
+static inline bool test_preempt_need_resched(void)
+{
+	return false;
+}
+
+static inline void __preempt_count_add(int val)
+{
+	S390_lowcore.preempt_count += val;
+}
+
+static inline void __preempt_count_sub(int val)
+{
+	S390_lowcore.preempt_count -= val;
+}
+
+static inline bool __preempt_count_dec_and_test(void)
+{
+	return !--S390_lowcore.preempt_count && tif_need_resched();
+}
+
+static inline bool should_resched(int preempt_offset)
+{
+	return unlikely(preempt_count() == preempt_offset &&
+			tif_need_resched());
+}
+
+#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */
+
+#ifdef CONFIG_PREEMPT
+extern asmlinkage void preempt_schedule(void);
+#define __preempt_schedule() preempt_schedule()
+extern asmlinkage void preempt_schedule_notrace(void);
+#define __preempt_schedule_notrace() preempt_schedule_notrace()
+#endif /* CONFIG_PREEMPT */
+
+#endif /* __ASM_PREEMPT_H */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 9d3a21aedc97..6bca916a5ba0 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -110,14 +110,20 @@ typedef struct {
 struct thread_struct {
 	unsigned int  acrs[NUM_ACRS];
         unsigned long ksp;              /* kernel stack pointer             */
+	unsigned long user_timer;	/* task cputime in user space */
+	unsigned long system_timer;	/* task cputime in kernel space */
+	unsigned long sys_call_table;	/* system call table address */
 	mm_segment_t mm_segment;
 	unsigned long gmap_addr;	/* address of last gmap fault. */
 	unsigned int gmap_write_flag;	/* gmap fault write indication */
 	unsigned int gmap_int_code;	/* int code of last gmap fault */
 	unsigned int gmap_pfault;	/* signal of a pending guest pfault */
+	/* Per-thread information related to debugging */
 	struct per_regs per_user;	/* User specified PER registers */
 	struct per_event per_event;	/* Cause of the last PER trap */
 	unsigned long per_flags;	/* Flags to control debug behavior */
+	unsigned int system_call;	/* system call number in signal */
+	unsigned long last_break;	/* last breaking-event-address. */
         /* pfault_wait is used to block the process on a pfault event */
 	unsigned long pfault_wait;
 	struct list_head list;
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 2ad9c204b1a2..8db92a5b3bf1 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -101,7 +101,8 @@ struct zpci_report_error_header {
 	u8 data[0];	/* Subsequent Data passed verbatim to SCLP ET 24 */
 } __packed;
 
-int sclp_get_core_info(struct sclp_core_info *info);
+int _sclp_get_core_info_early(struct sclp_core_info *info);
+int _sclp_get_core_info(struct sclp_core_info *info);
 int sclp_core_configure(u8 core);
 int sclp_core_deconfigure(u8 core);
 int sclp_sdias_blk_count(void);
@@ -119,4 +120,11 @@ void sclp_early_detect(void);
 void _sclp_print_early(const char *);
 void sclp_ocf_cpc_name_copy(char *dst);
 
+static inline int sclp_get_core_info(struct sclp_core_info *info, int early)
+{
+	if (early)
+		return _sclp_get_core_info_early(info);
+	return _sclp_get_core_info(info);
+}
+
 #endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h
index 4af99cdaddf5..17a7904f001a 100644
--- a/arch/s390/include/asm/scsw.h
+++ b/arch/s390/include/asm/scsw.h
@@ -96,7 +96,8 @@ struct tm_scsw {
 	u32 dstat:8;
 	u32 cstat:8;
 	u32 fcxs:8;
-	u32 schxs:8;
+	u32 ifob:1;
+	u32 sesq:7;
 } __attribute__ ((packed));
 
 /**
@@ -177,6 +178,9 @@ union scsw {
 #define SCHN_STAT_INTF_CTRL_CHK	 0x02
 #define SCHN_STAT_CHAIN_CHECK	 0x01
 
+#define SCSW_SESQ_DEV_NOFCX	 3
+#define SCSW_SESQ_PATH_NOFCX	 4
+
 /*
  * architectured values for first sense byte
  */
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index 0cc383b9be7f..3deb134587b7 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -36,6 +36,7 @@ extern void smp_yield_cpu(int cpu);
 extern void smp_cpu_set_polarization(int cpu, int val);
 extern int smp_cpu_get_polarization(int cpu);
 extern void smp_fill_possible_mask(void);
+extern void smp_detect_cpus(void);
 
 #else /* CONFIG_SMP */
 
@@ -56,6 +57,7 @@ static inline int smp_store_status(int cpu) { return 0; }
 static inline int smp_vcpu_scheduled(int cpu) { return 1; }
 static inline void smp_yield_cpu(int cpu) { }
 static inline void smp_fill_possible_mask(void) { }
+static inline void smp_detect_cpus(void) { }
 
 #endif /* CONFIG_SMP */
 
@@ -69,6 +71,12 @@ static inline void smp_stop_cpu(void)
 	}
 }
 
+/* Return thread 0 CPU number as base CPU */
+static inline int smp_get_base_cpu(int cpu)
+{
+	return cpu - (cpu % (smp_cpu_mtid + 1));
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 extern int smp_rescan_cpus(void);
 extern void __noreturn cpu_die(void);
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h
index 8662f5c8e17f..15a3c005c274 100644
--- a/arch/s390/include/asm/string.h
+++ b/arch/s390/include/asm/string.h
@@ -14,6 +14,7 @@
 #define __HAVE_ARCH_MEMCHR	/* inline & arch function */
 #define __HAVE_ARCH_MEMCMP	/* arch function */
 #define __HAVE_ARCH_MEMCPY	/* gcc builtin & arch function */
+#define __HAVE_ARCH_MEMMOVE	/* gcc builtin & arch function */
 #define __HAVE_ARCH_MEMSCAN	/* inline & arch function */
 #define __HAVE_ARCH_MEMSET	/* gcc builtin & arch function */
 #define __HAVE_ARCH_STRCAT	/* inline & arch function */
@@ -32,6 +33,7 @@
 extern int memcmp(const void *, const void *, size_t);
 extern void *memcpy(void *, const void *, size_t);
 extern void *memset(void *, int, size_t);
+extern void *memmove(void *, const void *, size_t);
 extern int strcmp(const char *,const char *);
 extern size_t strlcat(char *, const char *, size_t);
 extern size_t strlcpy(char *, const char *, size_t);
@@ -40,7 +42,6 @@ extern char *strncpy(char *, const char *, size_t);
 extern char *strrchr(const char *, int);
 extern char *strstr(const char *, const char *);
 
-#undef __HAVE_ARCH_MEMMOVE
 #undef __HAVE_ARCH_STRCHR
 #undef __HAVE_ARCH_STRNCHR
 #undef __HAVE_ARCH_STRNCMP
diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 2728114d5484..229326c942c7 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -107,6 +107,11 @@ struct sysinfo_2_2_2 {
 	char reserved_3[5];
 	unsigned short cpus_dedicated;
 	unsigned short cpus_shared;
+	char reserved_4[3];
+	unsigned char vsne;
+	uuid_be uuid;
+	char reserved_5[160];
+	char ext_name[256];
 };
 
 #define LPAR_CHAR_DEDICATED	(1 << 7)
@@ -127,7 +132,7 @@ struct sysinfo_3_2_2 {
 		unsigned int caf;
 		char cpi[16];
 		char reserved_1[3];
-		char ext_name_encoding;
+		unsigned char evmne;
 		unsigned int reserved_2;
 		uuid_be uuid;
 	} vm[8];
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index f15c0398c363..a5b54a445eb8 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -12,10 +12,10 @@
 /*
  * Size of kernel stack for each process
  */
-#define THREAD_ORDER 2
+#define THREAD_SIZE_ORDER 2
 #define ASYNC_ORDER  2
 
-#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define ASYNC_SIZE  (PAGE_SIZE << ASYNC_ORDER)
 
 #ifndef __ASSEMBLY__
@@ -30,15 +30,7 @@
  * - if the contents of this structure are changed, the assembly constants must also be changed
  */
 struct thread_info {
-	struct task_struct	*task;		/* main task structure */
 	unsigned long		flags;		/* low level flags */
-	unsigned long		sys_call_table;	/* System call table address */
-	unsigned int		cpu;		/* current CPU */
-	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
-	unsigned int		system_call;
-	__u64			user_timer;
-	__u64			system_timer;
-	unsigned long		last_break;	/* last breaking-event-address. */
 };
 
 /*
@@ -46,26 +38,14 @@ struct thread_info {
  */
 #define INIT_THREAD_INFO(tsk)			\
 {						\
-	.task		= &tsk,			\
 	.flags		= 0,			\
-	.cpu		= 0,			\
-	.preempt_count	= INIT_PREEMPT_COUNT,	\
 }
 
-#define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
-/* how to get the thread information struct from C */
-static inline struct thread_info *current_thread_info(void)
-{
-	return (struct thread_info *) S390_lowcore.thread_info;
-}
-
 void arch_release_task_struct(struct task_struct *tsk);
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 
-#define THREAD_SIZE_ORDER THREAD_ORDER
-
 #endif
 
 /*
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 0bb08f341c09..de8298800722 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -52,11 +52,9 @@ static inline void store_clock_comparator(__u64 *time)
 
 void clock_comparator_work(void);
 
-void __init ptff_init(void);
+void __init time_early_init(void);
 
 extern unsigned char ptff_function_mask[16];
-extern unsigned long lpar_offset;
-extern unsigned long initial_leap_seconds;
 
 /* Function codes for the ptff instruction. */
 #define PTFF_QAF	0x00	/* query available functions */
@@ -100,21 +98,28 @@ struct ptff_qui {
 	unsigned int pad_0x5c[41];
 } __packed;
 
-static inline int ptff(void *ptff_block, size_t len, unsigned int func)
-{
-	typedef struct { char _[len]; } addrtype;
-	register unsigned int reg0 asm("0") = func;
-	register unsigned long reg1 asm("1") = (unsigned long) ptff_block;
-	int rc;
-
-	asm volatile(
-		"	.word	0x0104\n"
-		"	ipm	%0\n"
-		"	srl	%0,28\n"
-		: "=d" (rc), "+m" (*(addrtype *) ptff_block)
-		: "d" (reg0), "d" (reg1) : "cc");
-	return rc;
-}
+/*
+ * ptff - Perform timing facility function
+ * @ptff_block: Pointer to ptff parameter block
+ * @len: Length of parameter block
+ * @func: Function code
+ * Returns: Condition code (0 on success)
+ */
+#define ptff(ptff_block, len, func)					\
+({									\
+	struct addrtype { char _[len]; };				\
+	register unsigned int reg0 asm("0") = func;			\
+	register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\
+	int rc;								\
+									\
+	asm volatile(							\
+		"	.word	0x0104\n"				\
+		"	ipm	%0\n"					\
+		"	srl	%0,28\n"				\
+		: "=d" (rc), "+m" (*(struct addrtype *) reg1)		\
+		: "d" (reg0), "d" (reg1) : "cc");			\
+	rc;								\
+})
 
 static inline unsigned long long local_tick_disable(void)
 {
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index f15f5571ca2b..fa1bfce10370 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,21 +22,22 @@ struct cpu_topology_s390 {
 	cpumask_t drawer_mask;
 };
 
-DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
-
-#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
-#define topology_thread_id(cpu)		  (per_cpu(cpu_topology, cpu).thread_id)
-#define topology_sibling_cpumask(cpu) \
-		(&per_cpu(cpu_topology, cpu).thread_mask)
-#define topology_core_id(cpu)		  (per_cpu(cpu_topology, cpu).core_id)
-#define topology_core_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).core_mask)
-#define topology_book_id(cpu)		  (per_cpu(cpu_topology, cpu).book_id)
-#define topology_book_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).book_mask)
-#define topology_drawer_id(cpu)		  (per_cpu(cpu_topology, cpu).drawer_id)
-#define topology_drawer_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).drawer_mask)
+extern struct cpu_topology_s390 cpu_topology[NR_CPUS];
+extern cpumask_t cpus_with_topology;
+
+#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id)
+#define topology_thread_id(cpu)		  (cpu_topology[cpu].thread_id)
+#define topology_sibling_cpumask(cpu)	  (&cpu_topology[cpu].thread_mask)
+#define topology_core_id(cpu)		  (cpu_topology[cpu].core_id)
+#define topology_core_cpumask(cpu)	  (&cpu_topology[cpu].core_mask)
+#define topology_book_id(cpu)		  (cpu_topology[cpu].book_id)
+#define topology_book_cpumask(cpu)	  (&cpu_topology[cpu].book_mask)
+#define topology_drawer_id(cpu)		  (cpu_topology[cpu].drawer_id)
+#define topology_drawer_cpumask(cpu)	  (&cpu_topology[cpu].drawer_mask)
 
 #define mc_capable() 1
 
+void topology_init_early(void);
 int topology_cpu_init(struct cpu *);
 int topology_set_cpu_management(int fc);
 void topology_schedule_update(void);
@@ -46,6 +47,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #else /* CONFIG_SCHED_TOPOLOGY */
 
+static inline void topology_init_early(void) { }
 static inline void topology_schedule_update(void) { }
 static inline int topology_cpu_init(struct cpu *cpu) { return 0; }
 static inline void topology_expect_change(void) { }
@@ -65,7 +67,7 @@ static inline void topology_expect_change(void) { }
 #define cpu_to_node cpu_to_node
 static inline int cpu_to_node(int cpu)
 {
-	return per_cpu(cpu_topology, cpu).node_id;
+	return cpu_topology[cpu].node_id;
 }
 
 /* Returns a pointer to the cpumask of CPUs on node 'node'. */
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 52d7c8709279..f82b04e85a21 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -37,14 +37,14 @@
 #define get_ds()        (KERNEL_DS)
 #define get_fs()        (current->thread.mm_segment)
 
-#define set_fs(x) \
-({									\
+#define set_fs(x)							\
+{									\
 	unsigned long __pto;						\
 	current->thread.mm_segment = (x);				\
 	__pto = current->thread.mm_segment.ar4 ?			\
 		S390_lowcore.user_asce : S390_lowcore.kernel_asce;	\
 	__ctl_load(__pto, 7, 7);					\
-})
+}
 
 #define segment_eq(a,b) ((a).ar4 == (b).ar4)
 
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index d0a2dbf2433d..88bdc477a843 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -33,6 +33,8 @@ struct vdso_data {
 	__u32 ectg_available;		/* ECTG instruction present	0x58 */
 	__u32 tk_mult;			/* Mult. used for xtime_nsec	0x5c */
 	__u32 tk_shift;			/* Shift used for xtime_nsec	0x60 */
+	__u32 ts_dir;			/* TOD steering direction	0x64 */
+	__u64 ts_end;			/* TOD steering end		0x68 */
 };
 
 struct vdso_per_cpu_data {
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index cc44b09c25fc..bf736e764cb4 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -12,6 +12,7 @@ header-y += dasd.h
 header-y += debug.h
 header-y += errno.h
 header-y += fcntl.h
+header-y += hypfs.h
 header-y += ioctl.h
 header-y += ioctls.h
 header-y += ipcbuf.h
@@ -29,16 +30,16 @@ header-y += ptrace.h
 header-y += qeth.h
 header-y += resource.h
 header-y += schid.h
+header-y += sclp_ctl.h
 header-y += sembuf.h
 header-y += setup.h
 header-y += shmbuf.h
+header-y += sie.h
 header-y += sigcontext.h
 header-y += siginfo.h
 header-y += signal.h
 header-y += socket.h
 header-y += sockios.h
-header-y += sclp_ctl.h
-header-y += sie.h
 header-y += stat.h
 header-y += statfs.h
 header-y += swab.h
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 1f0fe98f6db9..36b5101c8606 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -2,20 +2,47 @@
 # Makefile for the linux kernel.
 #
 
-KCOV_INSTRUMENT_early.o := n
-KCOV_INSTRUMENT_sclp.o := n
-KCOV_INSTRUMENT_als.o := n
-
 ifdef CONFIG_FUNCTION_TRACER
-# Don't trace early setup code and tracing code
-CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
+
+# Do not trace tracer code
+CFLAGS_REMOVE_ftrace.o	= $(CC_FLAGS_FTRACE)
+
+# Do not trace early setup code
+CFLAGS_REMOVE_als.o	= $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_early.o	= $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_sclp.o	= $(CC_FLAGS_FTRACE)
+
+endif
+
+GCOV_PROFILE_als.o	:= n
+GCOV_PROFILE_early.o	:= n
+GCOV_PROFILE_sclp.o	:= n
+
+KCOV_INSTRUMENT_als.o	:= n
+KCOV_INSTRUMENT_early.o	:= n
+KCOV_INSTRUMENT_sclp.o	:= n
+
+UBSAN_SANITIZE_als.o	:= n
+UBSAN_SANITIZE_early.o	:= n
+UBSAN_SANITIZE_sclp.o	:= n
+
+#
+# Use -march=z900 for sclp.c and als.c to be able to print an error
+# message if the kernel is started on a machine which is too old
+#
+ifneq ($(CC_FLAGS_MARCH),-march=z900)
+CFLAGS_REMOVE_als.o	+= $(CC_FLAGS_MARCH)
+CFLAGS_als.o		+= -march=z900
+CFLAGS_REMOVE_sclp.o	+= $(CC_FLAGS_MARCH)
+CFLAGS_sclp.o		+= -march=z900
+AFLAGS_REMOVE_head.o	+= $(CC_FLAGS_MARCH)
+AFLAGS_head.o		+= -march=z900
 endif
 
 #
 # Passing null pointers is ok for smp code, since we access the lowcore here.
 #
-CFLAGS_smp.o	:= -Wno-nonnull
+CFLAGS_smp.o		:= -Wno-nonnull
 
 #
 # Disable tailcall optimizations for stack / callchain walking functions
@@ -30,27 +57,7 @@ CFLAGS_dumpstack.o	+= -fno-optimize-sibling-calls
 #
 CFLAGS_ptrace.o		+= -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
-CFLAGS_sysinfo.o += -w
-
-#
-# Use -march=z900 for sclp.c and als.c to be able to print an error
-# message if the kernel is started on a machine which is too old
-#
-CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE)
-ifneq ($(CC_FLAGS_MARCH),-march=z900)
-CFLAGS_REMOVE_sclp.o	+= $(CC_FLAGS_MARCH)
-CFLAGS_sclp.o		+= -march=z900
-CFLAGS_REMOVE_als.o	+= $(CC_FLAGS_MARCH)
-CFLAGS_als.o		+= -march=z900
-AFLAGS_REMOVE_head.o	+= $(CC_FLAGS_MARCH)
-AFLAGS_head.o		+= -march=z900
-endif
-GCOV_PROFILE_sclp.o := n
-GCOV_PROFILE_als.o := n
-UBSAN_SANITIZE_als.o := n
-UBSAN_SANITIZE_early.o := n
-UBSAN_SANITIZE_sclp.o := n
+CFLAGS_sysinfo.o	+= -w
 
 obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index f3df9e0a5dec..c4b3570ded5b 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -25,12 +25,14 @@
 int main(void)
 {
 	/* task struct offsets */
-	OFFSET(__TASK_thread_info, task_struct, stack);
+	OFFSET(__TASK_stack, task_struct, stack);
 	OFFSET(__TASK_thread, task_struct, thread);
 	OFFSET(__TASK_pid, task_struct, pid);
 	BLANK();
 	/* thread struct offsets */
 	OFFSET(__THREAD_ksp, thread_struct, ksp);
+	OFFSET(__THREAD_sysc_table,  thread_struct, sys_call_table);
+	OFFSET(__THREAD_last_break, thread_struct, last_break);
 	OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc);
 	OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs);
 	OFFSET(__THREAD_per_cause, thread_struct, per_event.cause);
@@ -39,14 +41,7 @@ int main(void)
 	OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb);
 	BLANK();
 	/* thread info offsets */
-	OFFSET(__TI_task, thread_info, task);
-	OFFSET(__TI_flags, thread_info, flags);
-	OFFSET(__TI_sysc_table, thread_info, sys_call_table);
-	OFFSET(__TI_cpu, thread_info, cpu);
-	OFFSET(__TI_precount, thread_info, preempt_count);
-	OFFSET(__TI_user_timer, thread_info, user_timer);
-	OFFSET(__TI_system_timer, thread_info, system_timer);
-	OFFSET(__TI_last_break, thread_info, last_break);
+	OFFSET(__TI_flags, task_struct, thread_info.flags);
 	BLANK();
 	/* pt_regs offsets */
 	OFFSET(__PT_ARGS, pt_regs, args);
@@ -79,6 +74,8 @@ int main(void)
 	OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available);
 	OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult);
 	OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift);
+	OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir);
+	OFFSET(__VDSO_TS_END, vdso_data, ts_end);
 	OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base);
 	OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time);
 	OFFSET(__VDSO_CPU_NR, vdso_per_cpu_data, cpu_nr);
@@ -159,7 +156,6 @@ int main(void)
 	OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
 	OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
 	OFFSET(__LC_CURRENT, lowcore, current_task);
-	OFFSET(__LC_THREAD_INFO, lowcore, thread_info);
 	OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
 	OFFSET(__LC_ASYNC_STACK, lowcore, async_stack);
 	OFFSET(__LC_PANIC_STACK, lowcore, panic_stack);
@@ -173,6 +169,7 @@ int main(void)
 	OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset);
 	OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data);
 	OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags);
+	OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
 	OFFSET(__LC_GMAP, lowcore, gmap);
 	OFFSET(__LC_PASTE, lowcore, paste);
 	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 4af60374eba0..6f2a6ab13cb5 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -446,7 +446,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
 		/* set extra registers only for synchronous signals */
 		regs->gprs[4] = regs->int_code & 127;
 		regs->gprs[5] = regs->int_parm_long;
-		regs->gprs[6] = task_thread_info(current)->last_break;
+		regs->gprs[6] = current->thread.last_break;
 	}
 
 	return 0;
@@ -523,7 +523,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
 	regs->gprs[2] = ksig->sig;
 	regs->gprs[3] = (__force __u64) &frame->info;
 	regs->gprs[4] = (__force __u64) &frame->uc;
-	regs->gprs[5] = task_thread_info(current)->last_break;
+	regs->gprs[5] = current->thread.last_break;
 	return 0;
 }
 
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 2374c5b46bbc..d038c8cea6cb 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -293,6 +293,7 @@ static noinline __init void setup_lowcore_early(void)
 	psw.addr = (unsigned long) s390_base_pgm_handler;
 	S390_lowcore.program_new_psw = psw;
 	s390_base_pgm_handler_fn = early_pgm_check_handler;
+	S390_lowcore.preempt_count = INIT_PREEMPT_COUNT;
 }
 
 static noinline __init void setup_facility_list(void)
@@ -391,7 +392,49 @@ static int __init cad_init(void)
 }
 early_initcall(cad_init);
 
-static __init void rescue_initrd(void)
+static __init void memmove_early(void *dst, const void *src, size_t n)
+{
+	unsigned long addr;
+	long incr;
+	psw_t old;
+
+	if (!n)
+		return;
+	incr = 1;
+	if (dst > src) {
+		incr = -incr;
+		dst += n - 1;
+		src += n - 1;
+	}
+	old = S390_lowcore.program_new_psw;
+	S390_lowcore.program_new_psw.mask = __extract_psw();
+	asm volatile(
+		"	larl	%[addr],1f\n"
+		"	stg	%[addr],%[psw_pgm_addr]\n"
+		"0:     mvc	0(1,%[dst]),0(%[src])\n"
+		"	agr	%[dst],%[incr]\n"
+		"	agr	%[src],%[incr]\n"
+		"	brctg	%[n],0b\n"
+		"1:\n"
+		: [addr] "=&d" (addr),
+		  [psw_pgm_addr] "=&Q" (S390_lowcore.program_new_psw.addr),
+		  [dst] "+&a" (dst), [src] "+&a" (src),  [n] "+d" (n)
+		: [incr] "d" (incr)
+		: "cc", "memory");
+	S390_lowcore.program_new_psw = old;
+}
+
+static __init noinline void ipl_save_parameters(void)
+{
+	void *src, *dst;
+
+	src = (void *)(unsigned long) S390_lowcore.ipl_parmblock_ptr;
+	dst = (void *) IPL_PARMBLOCK_ORIGIN;
+	memmove_early(dst, src, PAGE_SIZE);
+	S390_lowcore.ipl_parmblock_ptr = IPL_PARMBLOCK_ORIGIN;
+}
+
+static __init noinline void rescue_initrd(void)
 {
 #ifdef CONFIG_BLK_DEV_INITRD
 	unsigned long min_initrd_addr = (unsigned long) _end + (4UL << 20);
@@ -405,7 +448,7 @@ static __init void rescue_initrd(void)
 		return;
 	if (INITRD_START >= min_initrd_addr)
 		return;
-	memmove((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE);
+	memmove_early((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE);
 	INITRD_START = min_initrd_addr;
 #endif
 }
@@ -467,7 +510,8 @@ void __init startup_init(void)
 	ipl_save_parameters();
 	rescue_initrd();
 	clear_bss_section();
-	ptff_init();
+	ipl_verify_parameters();
+	time_early_init();
 	init_kernel_storage_key();
 	lockdep_off();
 	setup_lowcore_early();
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 49a30737adde..97298c58b2be 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -42,7 +42,7 @@ __PT_R13     =	__PT_GPRS + 104
 __PT_R14     =	__PT_GPRS + 112
 __PT_R15     =	__PT_GPRS + 120
 
-STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
+STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER
 STACK_SIZE  = 1 << STACK_SHIFT
 STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
 
@@ -123,8 +123,14 @@ _PIF_WORK	= (_PIF_PER_TRAP)
 
 	.macro	LAST_BREAK scratch
 	srag	\scratch,%r10,23
+#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
 	jz	.+10
-	stg	%r10,__TI_last_break(%r12)
+	stg	%r10,__TASK_thread+__THREAD_last_break(%r12)
+#else
+	jz	.+14
+	lghi	\scratch,__TASK_thread
+	stg	%r10,__THREAD_last_break(\scratch,%r12)
+#endif
 	.endm
 
 	.macro REENABLE_IRQS
@@ -186,14 +192,13 @@ ENTRY(__switch_to)
 	stmg	%r6,%r15,__SF_GPRS(%r15)	# store gprs of prev task
 	lgr	%r1,%r2
 	aghi	%r1,__TASK_thread		# thread_struct of prev task
-	lg	%r5,__TASK_thread_info(%r3)	# get thread_info of next
+	lg	%r5,__TASK_stack(%r3)		# start of kernel stack of next
 	stg	%r15,__THREAD_ksp(%r1)		# store kernel stack of prev
 	lgr	%r1,%r3
 	aghi	%r1,__TASK_thread		# thread_struct of next task
 	lgr	%r15,%r5
 	aghi	%r15,STACK_INIT			# end of kernel stack of next
 	stg	%r3,__LC_CURRENT		# store task struct of next
-	stg	%r5,__LC_THREAD_INFO		# store thread info of next
 	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack
 	lg	%r15,__THREAD_ksp(%r1)		# load kernel stack of next
 	/* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */
@@ -274,7 +279,7 @@ ENTRY(system_call)
 .Lsysc_stmg:
 	stmg	%r8,%r15,__LC_SAVE_AREA_SYNC
 	lg	%r10,__LC_LAST_BREAK
-	lg	%r12,__LC_THREAD_INFO
+	lg	%r12,__LC_CURRENT
 	lghi	%r14,_PIF_SYSCALL
 .Lsysc_per:
 	lg	%r15,__LC_KERNEL_STACK
@@ -288,7 +293,13 @@ ENTRY(system_call)
 	mvc	__PT_INT_CODE(4,%r11),__LC_SVC_ILC
 	stg	%r14,__PT_FLAGS(%r11)
 .Lsysc_do_svc:
-	lg	%r10,__TI_sysc_table(%r12)	# address of system call table
+	# load address of system call table
+#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
+	lg	%r10,__TASK_thread+__THREAD_sysc_table(%r12)
+#else
+	lghi	%r13,__TASK_thread
+	lg	%r10,__THREAD_sysc_table(%r13,%r12)
+#endif
 	llgh	%r8,__PT_INT_CODE+2(%r11)
 	slag	%r8,%r8,2			# shift and test for svc 0
 	jnz	.Lsysc_nr_ok
@@ -389,7 +400,6 @@ ENTRY(system_call)
 	TSTMSK	__PT_FLAGS(%r11),_PIF_SYSCALL
 	jno	.Lsysc_return
 	lmg	%r2,%r7,__PT_R2(%r11)	# load svc arguments
-	lg	%r10,__TI_sysc_table(%r12)	# address of system call table
 	lghi	%r8,0			# svc 0 returns -ENOSYS
 	llgh	%r1,__PT_INT_CODE+2(%r11)	# load new svc number
 	cghi	%r1,NR_syscalls
@@ -457,7 +467,7 @@ ENTRY(system_call)
 #
 ENTRY(ret_from_fork)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
-	lg	%r12,__LC_THREAD_INFO
+	lg	%r12,__LC_CURRENT
 	brasl	%r14,schedule_tail
 	TRACE_IRQS_ON
 	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
@@ -478,7 +488,7 @@ ENTRY(pgm_check_handler)
 	stpt	__LC_SYNC_ENTER_TIMER
 	stmg	%r8,%r15,__LC_SAVE_AREA_SYNC
 	lg	%r10,__LC_LAST_BREAK
-	lg	%r12,__LC_THREAD_INFO
+	lg	%r12,__LC_CURRENT
 	larl	%r13,cleanup_critical
 	lmg	%r8,%r9,__LC_PGM_OLD_PSW
 	tmhh	%r8,0x0001		# test problem state bit
@@ -501,7 +511,7 @@ ENTRY(pgm_check_handler)
 2:	LAST_BREAK %r14
 	UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
 	lg	%r15,__LC_KERNEL_STACK
-	lg	%r14,__TI_task(%r12)
+	lgr	%r14,%r12
 	aghi	%r14,__TASK_thread	# pointer to thread_struct
 	lghi	%r13,__LC_PGM_TDB
 	tm	__LC_PGM_ILC+2,0x02	# check for transaction abort
@@ -567,7 +577,7 @@ ENTRY(io_int_handler)
 	stpt	__LC_ASYNC_ENTER_TIMER
 	stmg	%r8,%r15,__LC_SAVE_AREA_ASYNC
 	lg	%r10,__LC_LAST_BREAK
-	lg	%r12,__LC_THREAD_INFO
+	lg	%r12,__LC_CURRENT
 	larl	%r13,cleanup_critical
 	lmg	%r8,%r9,__LC_IO_OLD_PSW
 	SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
@@ -626,7 +636,7 @@ ENTRY(io_int_handler)
 	jo	.Lio_work_user		# yes -> do resched & signal
 #ifdef CONFIG_PREEMPT
 	# check for preemptive scheduling
-	icm	%r0,15,__TI_precount(%r12)
+	icm	%r0,15,__LC_PREEMPT_COUNT
 	jnz	.Lio_restore		# preemption is disabled
 	TSTMSK	__TI_flags(%r12),_TIF_NEED_RESCHED
 	jno	.Lio_restore
@@ -741,7 +751,7 @@ ENTRY(ext_int_handler)
 	stpt	__LC_ASYNC_ENTER_TIMER
 	stmg	%r8,%r15,__LC_SAVE_AREA_ASYNC
 	lg	%r10,__LC_LAST_BREAK
-	lg	%r12,__LC_THREAD_INFO
+	lg	%r12,__LC_CURRENT
 	larl	%r13,cleanup_critical
 	lmg	%r8,%r9,__LC_EXT_OLD_PSW
 	SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
@@ -798,13 +808,10 @@ ENTRY(save_fpu_regs)
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	bor	%r14
 	stfpc	__THREAD_FPU_fpc(%r2)
-.Lsave_fpu_regs_fpc_end:
 	lg	%r3,__THREAD_FPU_regs(%r2)
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	jz	.Lsave_fpu_regs_fp	  # no -> store FP regs
-.Lsave_fpu_regs_vx_low:
 	VSTM	%v0,%v15,0,%r3		  # vstm 0,15,0(3)
-.Lsave_fpu_regs_vx_high:
 	VSTM	%v16,%v31,256,%r3	  # vstm 16,31,256(3)
 	j	.Lsave_fpu_regs_done	  # -> set CIF_FPU flag
 .Lsave_fpu_regs_fp:
@@ -851,9 +858,7 @@ load_fpu_regs:
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	lg	%r4,__THREAD_FPU_regs(%r4)	# %r4 <- reg save area
 	jz	.Lload_fpu_regs_fp		# -> no VX, load FP regs
-.Lload_fpu_regs_vx:
 	VLM	%v0,%v15,0,%r4
-.Lload_fpu_regs_vx_high:
 	VLM	%v16,%v31,256,%r4
 	j	.Lload_fpu_regs_done
 .Lload_fpu_regs_fp:
@@ -889,7 +894,7 @@ ENTRY(mcck_int_handler)
 	spt	__LC_CPU_TIMER_SAVE_AREA-4095(%r1)	# revalidate cpu timer
 	lmg	%r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
 	lg	%r10,__LC_LAST_BREAK
-	lg	%r12,__LC_THREAD_INFO
+	lg	%r12,__LC_CURRENT
 	larl	%r13,cleanup_critical
 	lmg	%r8,%r9,__LC_MCK_OLD_PSW
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
@@ -948,7 +953,7 @@ ENTRY(mcck_int_handler)
 
 .Lmcck_panic:
 	lg	%r15,__LC_PANIC_STACK
-	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
+	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	j	.Lmcck_skip
 
 #
@@ -1085,7 +1090,7 @@ cleanup_critical:
 	jhe	0f
 	# set up saved registers r10 and r12
 	stg	%r10,16(%r11)		# r10 last break
-	stg	%r12,32(%r11)		# r12 thread-info pointer
+	stg	%r12,32(%r11)		# r12 task struct pointer
 0:	# check if the user time update has been done
 	clg	%r9,BASED(.Lcleanup_system_call_insn+24)
 	jh	0f
@@ -1106,7 +1111,9 @@ cleanup_critical:
 	lg	%r9,16(%r11)
 	srag	%r9,%r9,23
 	jz	0f
-	mvc	__TI_last_break(8,%r12),16(%r11)
+	lgr	%r9,%r12
+	aghi	%r9,__TASK_thread
+	mvc	__THREAD_last_break(8,%r9),16(%r11)
 0:	# set up saved register r11
 	lg	%r15,__LC_KERNEL_STACK
 	la	%r9,STACK_FRAME_OVERHEAD(%r15)
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index 4431905f8cfa..0b5ebf8a3d30 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -315,7 +315,7 @@ ENTRY(startup_kdump)
 	jg	startup_continue
 
 .Lstack:
-	.long	0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER))
+	.long	0x8000 + (1<<(PAGE_SHIFT+THREAD_SIZE_ORDER))
 	.align	8
 6:	.long	0x7fffffff,0xffffffff
 
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 03c2b469c472..482d3526e32b 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -32,11 +32,10 @@ ENTRY(startup_continue)
 #
 # Setup stack
 #
-	larl	%r15,init_thread_union
-	stg	%r15,__LC_THREAD_INFO	# cache thread info in lowcore
-	lg	%r14,__TI_task(%r15)	# cache current in lowcore
+	larl	%r14,init_task
 	stg	%r14,__LC_CURRENT
-	aghi	%r15,1<<(PAGE_SHIFT+THREAD_ORDER) # init_task_union + THREAD_SIZE
+	larl	%r15,init_thread_union
+	aghi	%r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) # init_task_union + THREAD_SIZE
 	stg	%r15,__LC_KERNEL_STACK	# set end of kernel stack
 	aghi	%r15,-160
 #
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 295bfb7124bc..ff3364a067ff 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -1991,10 +1991,9 @@ void __init ipl_update_parameters(void)
 		diag308_set_works = 1;
 }
 
-void __init ipl_save_parameters(void)
+void __init ipl_verify_parameters(void)
 {
 	struct cio_iplinfo iplinfo;
-	void *src, *dst;
 
 	if (cio_get_iplinfo(&iplinfo))
 		return;
@@ -2005,10 +2004,6 @@ void __init ipl_save_parameters(void)
 	if (!iplinfo.is_qdio)
 		return;
 	ipl_flags |= IPL_PARMBLOCK_VALID;
-	src = (void *)(unsigned long)S390_lowcore.ipl_parmblock_ptr;
-	dst = (void *)IPL_PARMBLOCK_ORIGIN;
-	memmove(dst, src, PAGE_SIZE);
-	S390_lowcore.ipl_parmblock_ptr = IPL_PARMBLOCK_ORIGIN;
 }
 
 static LIST_HEAD(rcall);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 285d6561076d..ef60f4177331 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -168,7 +168,7 @@ void do_softirq_own_stack(void)
 	old = current_stack_pointer();
 	/* Check against async. stack address range. */
 	new = S390_lowcore.async_stack;
-	if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) {
+	if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
 		/* Need to switch to the async. stack. */
 		new -= STACK_FRAME_OVERHEAD;
 		((struct stack_frame *) new)->back_chain = old;
diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c
index 6ea6d69339b5..ae7dff110054 100644
--- a/arch/s390/kernel/lgr.c
+++ b/arch/s390/kernel/lgr.c
@@ -5,7 +5,8 @@
  * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com>
  */
 
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
 #include <linux/timer.h>
 #include <linux/slab.h>
 #include <asm/facility.h>
@@ -183,4 +184,4 @@ static int __init lgr_init(void)
 	lgr_timer_set();
 	return 0;
 }
-module_init(lgr_init);
+device_initcall(lgr_init);
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index fcc634c1479a..763dec18edcd 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -995,39 +995,36 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	regs.int_parm = CPU_MF_INT_SF_PRA;
 	sde_regs = (struct perf_sf_sde_regs *) &regs.int_parm_long;
 
-	regs.psw.addr = sfr->basic.ia;
-	if (sfr->basic.T)
-		regs.psw.mask |= PSW_MASK_DAT;
-	if (sfr->basic.W)
-		regs.psw.mask |= PSW_MASK_WAIT;
-	if (sfr->basic.P)
-		regs.psw.mask |= PSW_MASK_PSTATE;
-	switch (sfr->basic.AS) {
-	case 0x0:
-		regs.psw.mask |= PSW_ASC_PRIMARY;
-		break;
-	case 0x1:
-		regs.psw.mask |= PSW_ASC_ACCREG;
-		break;
-	case 0x2:
-		regs.psw.mask |= PSW_ASC_SECONDARY;
-		break;
-	case 0x3:
-		regs.psw.mask |= PSW_ASC_HOME;
-		break;
-	}
+	psw_bits(regs.psw).ia = sfr->basic.ia;
+	psw_bits(regs.psw).t  = sfr->basic.T;
+	psw_bits(regs.psw).w  = sfr->basic.W;
+	psw_bits(regs.psw).p  = sfr->basic.P;
+	psw_bits(regs.psw).as = sfr->basic.AS;
 
 	/*
-	 * A non-zero guest program parameter indicates a guest
-	 * sample.
-	 * Note that some early samples or samples from guests without
+	 * Use the hardware provided configuration level to decide if the
+	 * sample belongs to a guest or host. If that is not available,
+	 * fall back to the following heuristics:
+	 * A non-zero guest program parameter always indicates a guest
+	 * sample. Some early samples or samples from guests without
 	 * lpp usage would be misaccounted to the host. We use the asn
-	 * value as a heuristic to detect most of these guest samples.
-	 * If the value differs from the host hpp value, we assume
-	 * it to be a KVM guest.
+	 * value as an addon heuristic to detect most of these guest samples.
+	 * If the value differs from the host hpp value, we assume to be a
+	 * KVM guest.
 	 */
-	if (sfr->basic.gpp || sfr->basic.prim_asn != (u16) sfr->basic.hpp)
+	switch (sfr->basic.CL) {
+	case 1: /* logical partition */
+		sde_regs->in_guest = 0;
+		break;
+	case 2: /* virtual machine */
 		sde_regs->in_guest = 1;
+		break;
+	default: /* old machine, use heuristics */
+		if (sfr->basic.gpp ||
+		    sfr->basic.prim_asn != (u16)sfr->basic.hpp)
+			sde_regs->in_guest = 1;
+		break;
+	}
 
 	overflow = 0;
 	if (perf_exclude_event(event, &regs, sde_regs))
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index bba4fa74b321..400d14f0b9f5 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -103,7 +103,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 		unsigned long arg, struct task_struct *p)
 {
-	struct thread_info *ti;
 	struct fake_frame
 	{
 		struct stack_frame sf;
@@ -121,9 +120,8 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 	memset(&p->thread.per_event, 0, sizeof(p->thread.per_event));
 	clear_tsk_thread_flag(p, TIF_SINGLE_STEP);
 	/* Initialize per thread user and system timer values */
-	ti = task_thread_info(p);
-	ti->user_timer = 0;
-	ti->system_timer = 0;
+	p->thread.user_timer = 0;
+	p->thread.system_timer = 0;
 
 	frame->sf.back_chain = 0;
 	/* new return point is ret_from_fork */
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 9336e824e2db..b81ab8882e2e 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -461,7 +461,7 @@ long arch_ptrace(struct task_struct *child, long request,
 		}
 		return 0;
 	case PTRACE_GET_LAST_BREAK:
-		put_user(task_thread_info(child)->last_break,
+		put_user(child->thread.last_break,
 			 (unsigned long __user *) data);
 		return 0;
 	case PTRACE_ENABLE_TE:
@@ -811,7 +811,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 		}
 		return 0;
 	case PTRACE_GET_LAST_BREAK:
-		put_user(task_thread_info(child)->last_break,
+		put_user(child->thread.last_break,
 			 (unsigned int __user *) data);
 		return 0;
 	}
@@ -997,10 +997,10 @@ static int s390_last_break_get(struct task_struct *target,
 	if (count > 0) {
 		if (kbuf) {
 			unsigned long *k = kbuf;
-			*k = task_thread_info(target)->last_break;
+			*k = target->thread.last_break;
 		} else {
 			unsigned long  __user *u = ubuf;
-			if (__put_user(task_thread_info(target)->last_break, u))
+			if (__put_user(target->thread.last_break, u))
 				return -EFAULT;
 		}
 	}
@@ -1113,7 +1113,7 @@ static int s390_system_call_get(struct task_struct *target,
 				unsigned int pos, unsigned int count,
 				void *kbuf, void __user *ubuf)
 {
-	unsigned int *data = &task_thread_info(target)->system_call;
+	unsigned int *data = &target->thread.system_call;
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				   data, 0, sizeof(unsigned int));
 }
@@ -1123,7 +1123,7 @@ static int s390_system_call_set(struct task_struct *target,
 				unsigned int pos, unsigned int count,
 				const void *kbuf, const void __user *ubuf)
 {
-	unsigned int *data = &task_thread_info(target)->system_call;
+	unsigned int *data = &target->thread.system_call;
 	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				  data, 0, sizeof(unsigned int));
 }
@@ -1327,7 +1327,7 @@ static int s390_compat_last_break_get(struct task_struct *target,
 	compat_ulong_t last_break;
 
 	if (count > 0) {
-		last_break = task_thread_info(target)->last_break;
+		last_break = target->thread.last_break;
 		if (kbuf) {
 			unsigned long *k = kbuf;
 			*k = last_break;
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 7f7ba5f23f13..adfac9f0a89f 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -35,6 +35,7 @@
 #include <linux/root_dev.h>
 #include <linux/console.h>
 #include <linux/kernel_stat.h>
+#include <linux/dma-contiguous.h>
 #include <linux/device.h>
 #include <linux/notifier.h>
 #include <linux/pfn.h>
@@ -303,7 +304,7 @@ static void __init setup_lowcore(void)
 	 * Setup lowcore for boot cpu
 	 */
 	BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096);
-	lc = __alloc_bootmem_low(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0);
+	lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc));
 	lc->restart_psw.mask = PSW_KERNEL_BITS;
 	lc->restart_psw.addr = (unsigned long) restart_int_handler;
 	lc->external_new_psw.mask = PSW_KERNEL_BITS |
@@ -324,15 +325,15 @@ static void __init setup_lowcore(void)
 	lc->kernel_stack = ((unsigned long) &init_thread_union)
 		+ THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
 	lc->async_stack = (unsigned long)
-		__alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0)
+		memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE)
 		+ ASYNC_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
 	lc->panic_stack = (unsigned long)
-		__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0)
+		memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE)
 		+ PAGE_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
-	lc->current_task = (unsigned long) init_thread_union.thread_info.task;
-	lc->thread_info = (unsigned long) &init_thread_union;
+	lc->current_task = (unsigned long)&init_task;
 	lc->lpp = LPP_MAGIC;
 	lc->machine_flags = S390_lowcore.machine_flags;
+	lc->preempt_count = S390_lowcore.preempt_count;
 	lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
 	       MAX_FACILITY_BIT/8);
@@ -349,7 +350,7 @@ static void __init setup_lowcore(void)
 	lc->last_update_timer = S390_lowcore.last_update_timer;
 	lc->last_update_clock = S390_lowcore.last_update_clock;
 
-	restart_stack = __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0);
+	restart_stack = memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE);
 	restart_stack += ASYNC_SIZE;
 
 	/*
@@ -412,7 +413,7 @@ static void __init setup_resources(void)
 	bss_resource.end = (unsigned long) &__bss_stop - 1;
 
 	for_each_memblock(memory, reg) {
-		res = alloc_bootmem_low(sizeof(*res));
+		res = memblock_virt_alloc(sizeof(*res), 8);
 		res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
 
 		res->name = "System RAM";
@@ -426,7 +427,7 @@ static void __init setup_resources(void)
 			    std_res->start > res->end)
 				continue;
 			if (std_res->end > res->end) {
-				sub_res = alloc_bootmem_low(sizeof(*sub_res));
+				sub_res = memblock_virt_alloc(sizeof(*sub_res), 8);
 				*sub_res = *std_res;
 				sub_res->end = res->end;
 				std_res->start = res->end + 1;
@@ -445,7 +446,7 @@ static void __init setup_resources(void)
 	 * part of the System RAM resource.
 	 */
 	if (crashk_res.end) {
-		memblock_add(crashk_res.start, resource_size(&crashk_res));
+		memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
 		memblock_reserve(crashk_res.start, resource_size(&crashk_res));
 		insert_resource(&iomem_resource, &crashk_res);
 	}
@@ -903,6 +904,7 @@ void __init setup_arch(char **cmdline_p)
 
 	setup_memory_end();
 	setup_memory();
+	dma_contiguous_reserve(memory_end);
 
 	check_initrd();
 	reserve_crashkernel();
@@ -921,6 +923,8 @@ void __init setup_arch(char **cmdline_p)
 	cpu_detect_mhz_feature();
         cpu_init();
 	numa_setup();
+	smp_detect_cpus();
+	topology_init_early();
 
 	/*
 	 * Create kernel page tables and switch to virtual addressing.
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index d82562cf0a0e..9f241d1efeda 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -359,7 +359,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
 		/* set extra registers only for synchronous signals */
 		regs->gprs[4] = regs->int_code & 127;
 		regs->gprs[5] = regs->int_parm_long;
-		regs->gprs[6] = task_thread_info(current)->last_break;
+		regs->gprs[6] = current->thread.last_break;
 	}
 	return 0;
 }
@@ -430,7 +430,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 	regs->gprs[2] = ksig->sig;
 	regs->gprs[3] = (unsigned long) &frame->info;
 	regs->gprs[4] = (unsigned long) &frame->uc;
-	regs->gprs[5] = task_thread_info(current)->last_break;
+	regs->gprs[5] = current->thread.last_break;
 	return 0;
 }
 
@@ -467,13 +467,13 @@ void do_signal(struct pt_regs *regs)
 	 * the debugger may change all our registers, including the system
 	 * call information.
 	 */
-	current_thread_info()->system_call =
+	current->thread.system_call =
 		test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0;
 
 	if (get_signal(&ksig)) {
 		/* Whee!  Actually deliver the signal.  */
-		if (current_thread_info()->system_call) {
-			regs->int_code = current_thread_info()->system_call;
+		if (current->thread.system_call) {
+			regs->int_code = current->thread.system_call;
 			/* Check for system call restarting. */
 			switch (regs->gprs[2]) {
 			case -ERESTART_RESTARTBLOCK:
@@ -506,8 +506,8 @@ void do_signal(struct pt_regs *regs)
 
 	/* No handlers present - check for system call restart */
 	clear_pt_regs_flag(regs, PIF_SYSCALL);
-	if (current_thread_info()->system_call) {
-		regs->int_code = current_thread_info()->system_call;
+	if (current->thread.system_call) {
+		regs->int_code = current->thread.system_call;
 		switch (regs->gprs[2]) {
 		case -ERESTART_RESTARTBLOCK:
 			/* Restart with sys_restart_syscall */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index df4a508ff35c..e49f61aadaf9 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -19,6 +19,7 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
 #include <linux/workqueue.h>
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@ -259,16 +260,14 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
 {
 	struct lowcore *lc = pcpu->lowcore;
-	struct thread_info *ti = task_thread_info(tsk);
 
 	lc->kernel_stack = (unsigned long) task_stack_page(tsk)
 		+ THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
-	lc->thread_info = (unsigned long) task_thread_info(tsk);
 	lc->current_task = (unsigned long) tsk;
 	lc->lpp = LPP_MAGIC;
 	lc->current_pid = tsk->pid;
-	lc->user_timer = ti->user_timer;
-	lc->system_timer = ti->system_timer;
+	lc->user_timer = tsk->thread.user_timer;
+	lc->system_timer = tsk->thread.system_timer;
 	lc->steal_timer = 0;
 }
 
@@ -662,14 +661,12 @@ int smp_cpu_get_polarization(int cpu)
 	return pcpu_devices[cpu].polarization;
 }
 
-static struct sclp_core_info *smp_get_core_info(void)
+static void __ref smp_get_core_info(struct sclp_core_info *info, int early)
 {
 	static int use_sigp_detection;
-	struct sclp_core_info *info;
 	int address;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (info && (use_sigp_detection || sclp_get_core_info(info))) {
+	if (use_sigp_detection || sclp_get_core_info(info, early)) {
 		use_sigp_detection = 1;
 		for (address = 0;
 		     address < (SCLP_MAX_CORES << smp_cpu_mt_shift);
@@ -683,7 +680,6 @@ static struct sclp_core_info *smp_get_core_info(void)
 		}
 		info->combined = info->configured;
 	}
-	return info;
 }
 
 static int smp_add_present_cpu(int cpu);
@@ -724,17 +720,15 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, int sysfs_add)
 	return nr;
 }
 
-static void __init smp_detect_cpus(void)
+void __init smp_detect_cpus(void)
 {
 	unsigned int cpu, mtid, c_cpus, s_cpus;
 	struct sclp_core_info *info;
 	u16 address;
 
 	/* Get CPU information */
-	info = smp_get_core_info();
-	if (!info)
-		panic("smp_detect_cpus failed to allocate memory\n");
-
+	info = memblock_virt_alloc(sizeof(*info), 8);
+	smp_get_core_info(info, 1);
 	/* Find boot CPU type */
 	if (sclp.has_core_type) {
 		address = stap();
@@ -770,7 +764,7 @@ static void __init smp_detect_cpus(void)
 	get_online_cpus();
 	__smp_rescan_cpus(info, 0);
 	put_online_cpus();
-	kfree(info);
+	memblock_free_early((unsigned long)info, sizeof(*info));
 }
 
 /*
@@ -807,7 +801,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 	pcpu = pcpu_devices + cpu;
 	if (pcpu->state != CPU_STATE_CONFIGURED)
 		return -EIO;
-	base = cpu - (cpu % (smp_cpu_mtid + 1));
+	base = smp_get_base_cpu(cpu);
 	for (i = 0; i <= smp_cpu_mtid; i++) {
 		if (base + i < nr_cpu_ids)
 			if (cpu_online(base + i))
@@ -907,7 +901,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	/* request the 0x1202 external call external interrupt */
 	if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1202");
-	smp_detect_cpus();
 }
 
 void __init smp_prepare_boot_cpu(void)
@@ -973,7 +966,7 @@ static ssize_t cpu_configure_store(struct device *dev,
 	rc = -EBUSY;
 	/* disallow configuration changes of online cpus and cpu 0 */
 	cpu = dev->id;
-	cpu -= cpu % (smp_cpu_mtid + 1);
+	cpu = smp_get_base_cpu(cpu);
 	if (cpu == 0)
 		goto out;
 	for (i = 0; i <= smp_cpu_mtid; i++)
@@ -1106,9 +1099,10 @@ int __ref smp_rescan_cpus(void)
 	struct sclp_core_info *info;
 	int nr;
 
-	info = smp_get_core_info();
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
+	smp_get_core_info(info, 0);
 	get_online_cpus();
 	mutex_lock(&smp_cpu_state_mutex);
 	nr = __smp_rescan_cpus(info, 1);
diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S
index 2d6b6e81f812..1ff21f05d7dd 100644
--- a/arch/s390/kernel/swsusp.S
+++ b/arch/s390/kernel/swsusp.S
@@ -194,7 +194,7 @@ pgm_check_entry:
 
 	/* Suspend CPU not available -> panic */
 	larl	%r15,init_thread_union
-	ahi	%r15,1<<(PAGE_SHIFT+THREAD_ORDER)
+	ahi	%r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER)
 	larl	%r2,.Lpanic_string
 	larl	%r3,_sclp_print_early
 	lghi	%r1,0
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index bfda6aa40280..24021c1e3ecb 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -56,6 +56,20 @@ int stsi(void *sysinfo, int fc, int sel1, int sel2)
 }
 EXPORT_SYMBOL(stsi);
 
+static bool convert_ext_name(unsigned char encoding, char *name, size_t len)
+{
+	switch (encoding) {
+	case 1: /* EBCDIC */
+		EBCASC(name, len);
+		break;
+	case 2:	/* UTF-8 */
+		break;
+	default:
+		return false;
+	}
+	return true;
+}
+
 static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
 {
 	int i;
@@ -207,24 +221,19 @@ static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info)
 		seq_printf(m, "LPAR CPUs S-MTID:     %d\n", info->mt_stid);
 		seq_printf(m, "LPAR CPUs PS-MTID:    %d\n", info->mt_psmtid);
 	}
+	if (convert_ext_name(info->vsne, info->ext_name, sizeof(info->ext_name))) {
+		seq_printf(m, "LPAR Extended Name:   %-.256s\n", info->ext_name);
+		seq_printf(m, "LPAR UUID:            %pUb\n", &info->uuid);
+	}
 }
 
 static void print_ext_name(struct seq_file *m, int lvl,
 			   struct sysinfo_3_2_2 *info)
 {
-	if (info->vm[lvl].ext_name_encoding == 0)
-		return;
-	if (info->ext_names[lvl][0] == 0)
-		return;
-	switch (info->vm[lvl].ext_name_encoding) {
-	case 1: /* EBCDIC */
-		EBCASC(info->ext_names[lvl], sizeof(info->ext_names[lvl]));
-		break;
-	case 2:	/* UTF-8 */
-		break;
-	default:
+	size_t len = sizeof(info->ext_names[lvl]);
+
+	if (!convert_ext_name(info->vm[lvl].evmne, info->ext_names[lvl], len))
 		return;
-	}
 	seq_printf(m, "VM%02d Extended Name:   %-.256s\n", lvl,
 		   info->ext_names[lvl]);
 }
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 0bfcc492987e..867d0a057046 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -59,19 +59,27 @@ ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier);
 EXPORT_SYMBOL(s390_epoch_delta_notifier);
 
 unsigned char ptff_function_mask[16];
-unsigned long lpar_offset;
-unsigned long initial_leap_seconds;
+
+static unsigned long long lpar_offset;
+static unsigned long long initial_leap_seconds;
+static unsigned long long tod_steering_end;
+static long long tod_steering_delta;
 
 /*
  * Get time offsets with PTFF
  */
-void __init ptff_init(void)
+void __init time_early_init(void)
 {
 	struct ptff_qto qto;
 	struct ptff_qui qui;
 
+	/* Initialize TOD steering parameters */
+	tod_steering_end = sched_clock_base_cc;
+	vdso_data->ts_end = tod_steering_end;
+
 	if (!test_facility(28))
 		return;
+
 	ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF);
 
 	/* get LPAR offset */
@@ -80,7 +88,7 @@ void __init ptff_init(void)
 
 	/* get initial leap seconds */
 	if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0)
-		initial_leap_seconds = (unsigned long)
+		initial_leap_seconds = (unsigned long long)
 			((long) qui.old_leap * 4096000000L);
 }
 
@@ -123,18 +131,6 @@ void clock_comparator_work(void)
 	cd->event_handler(cd);
 }
 
-/*
- * Fixup the clock comparator.
- */
-static void fixup_clock_comparator(unsigned long long delta)
-{
-	/* If nobody is waiting there's nothing to fix. */
-	if (S390_lowcore.clock_comparator == -1ULL)
-		return;
-	S390_lowcore.clock_comparator += delta;
-	set_clock_comparator(S390_lowcore.clock_comparator);
-}
-
 static int s390_next_event(unsigned long delta,
 			   struct clock_event_device *evt)
 {
@@ -215,7 +211,21 @@ void read_boot_clock64(struct timespec64 *ts)
 
 static cycle_t read_tod_clock(struct clocksource *cs)
 {
-	return get_tod_clock();
+	unsigned long long now, adj;
+
+	preempt_disable(); /* protect from changes to steering parameters */
+	now = get_tod_clock();
+	adj = tod_steering_end - now;
+	if (unlikely((s64) adj >= 0))
+		/*
+		 * manually steer by 1 cycle every 2^16 cycles. This
+		 * corresponds to shifting the tod delta by 15. 1s is
+		 * therefore steered in ~9h. The adjust will decrease
+		 * over time, until it finally reaches 0.
+		 */
+		now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15);
+	preempt_enable();
+	return now;
 }
 
 static struct clocksource clocksource_tod = {
@@ -384,6 +394,55 @@ static inline int check_sync_clock(void)
 	return rc;
 }
 
+/*
+ * Apply clock delta to the global data structures.
+ * This is called once on the CPU that performed the clock sync.
+ */
+static void clock_sync_global(unsigned long long delta)
+{
+	unsigned long now, adj;
+	struct ptff_qto qto;
+
+	/* Fixup the monotonic sched clock. */
+	sched_clock_base_cc += delta;
+	/* Adjust TOD steering parameters. */
+	vdso_data->tb_update_count++;
+	now = get_tod_clock();
+	adj = tod_steering_end - now;
+	if (unlikely((s64) adj >= 0))
+		/* Calculate how much of the old adjustment is left. */
+		tod_steering_delta = (tod_steering_delta < 0) ?
+			-(adj >> 15) : (adj >> 15);
+	tod_steering_delta += delta;
+	if ((abs(tod_steering_delta) >> 48) != 0)
+		panic("TOD clock sync offset %lli is too large to drift\n",
+		      tod_steering_delta);
+	tod_steering_end = now + (abs(tod_steering_delta) << 15);
+	vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1;
+	vdso_data->ts_end = tod_steering_end;
+	vdso_data->tb_update_count++;
+	/* Update LPAR offset. */
+	if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
+		lpar_offset = qto.tod_epoch_difference;
+	/* Call the TOD clock change notifier. */
+	atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, &delta);
+}
+
+/*
+ * Apply clock delta to the per-CPU data structures of this CPU.
+ * This is called for each online CPU after the call to clock_sync_global.
+ */
+static void clock_sync_local(unsigned long long delta)
+{
+	/* Add the delta to the clock comparator. */
+	if (S390_lowcore.clock_comparator != -1ULL) {
+		S390_lowcore.clock_comparator += delta;
+		set_clock_comparator(S390_lowcore.clock_comparator);
+	}
+	/* Adjust the last_update_clock time-stamp. */
+	S390_lowcore.last_update_clock += delta;
+}
+
 /* Single threaded workqueue used for stp sync events */
 static struct workqueue_struct *time_sync_wq;
 
@@ -397,31 +456,9 @@ static void __init time_init_wq(void)
 struct clock_sync_data {
 	atomic_t cpus;
 	int in_sync;
-	unsigned long long fixup_cc;
+	unsigned long long clock_delta;
 };
 
-static void clock_sync_cpu(struct clock_sync_data *sync)
-{
-	atomic_dec(&sync->cpus);
-	enable_sync_clock();
-	while (sync->in_sync == 0) {
-		__udelay(1);
-		/*
-		 * A different cpu changes *in_sync. Therefore use
-		 * barrier() to force memory access.
-		 */
-		barrier();
-	}
-	if (sync->in_sync != 1)
-		/* Didn't work. Clear per-cpu in sync bit again. */
-		disable_sync_clock(NULL);
-	/*
-	 * This round of TOD syncing is done. Set the clock comparator
-	 * to the next tick and let the processor continue.
-	 */
-	fixup_clock_comparator(sync->fixup_cc);
-}
-
 /*
  * Server Time Protocol (STP) code.
  */
@@ -523,54 +560,46 @@ void stp_queue_work(void)
 
 static int stp_sync_clock(void *data)
 {
-	static int first;
+	struct clock_sync_data *sync = data;
 	unsigned long long clock_delta;
-	struct clock_sync_data *stp_sync;
-	struct ptff_qto qto;
+	static int first;
 	int rc;
 
-	stp_sync = data;
-
-	if (xchg(&first, 1) == 1) {
-		/* Slave */
-		clock_sync_cpu(stp_sync);
-		return 0;
-	}
-
-	/* Wait until all other cpus entered the sync function. */
-	while (atomic_read(&stp_sync->cpus) != 0)
-		cpu_relax();
-
 	enable_sync_clock();
-
-	rc = 0;
-	if (stp_info.todoff[0] || stp_info.todoff[1] ||
-	    stp_info.todoff[2] || stp_info.todoff[3] ||
-	    stp_info.tmd != 2) {
-		rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta);
-		if (rc == 0) {
-			/* fixup the monotonic sched clock */
-			sched_clock_base_cc += clock_delta;
-			if (ptff_query(PTFF_QTO) &&
-			    ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
-				/* Update LPAR offset */
-				lpar_offset = qto.tod_epoch_difference;
-			atomic_notifier_call_chain(&s390_epoch_delta_notifier,
-						   0, &clock_delta);
-			stp_sync->fixup_cc = clock_delta;
-			fixup_clock_comparator(clock_delta);
-			rc = chsc_sstpi(stp_page, &stp_info,
-					sizeof(struct stp_sstpi));
-			if (rc == 0 && stp_info.tmd != 2)
-				rc = -EAGAIN;
+	if (xchg(&first, 1) == 0) {
+		/* Wait until all other cpus entered the sync function. */
+		while (atomic_read(&sync->cpus) != 0)
+			cpu_relax();
+		rc = 0;
+		if (stp_info.todoff[0] || stp_info.todoff[1] ||
+		    stp_info.todoff[2] || stp_info.todoff[3] ||
+		    stp_info.tmd != 2) {
+			rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0,
+					&clock_delta);
+			if (rc == 0) {
+				sync->clock_delta = clock_delta;
+				clock_sync_global(clock_delta);
+				rc = chsc_sstpi(stp_page, &stp_info,
+						sizeof(struct stp_sstpi));
+				if (rc == 0 && stp_info.tmd != 2)
+					rc = -EAGAIN;
+			}
 		}
+		sync->in_sync = rc ? -EAGAIN : 1;
+		xchg(&first, 0);
+	} else {
+		/* Slave */
+		atomic_dec(&sync->cpus);
+		/* Wait for in_sync to be set. */
+		while (READ_ONCE(sync->in_sync) == 0)
+			__udelay(1);
 	}
-	if (rc) {
+	if (sync->in_sync != 1)
+		/* Didn't work. Clear per-cpu in sync bit again. */
 		disable_sync_clock(NULL);
-		stp_sync->in_sync = -EAGAIN;
-	} else
-		stp_sync->in_sync = 1;
-	xchg(&first, 0);
+	/* Apply clock delta to per-CPU fields of this CPU. */
+	clock_sync_local(sync->clock_delta);
+
 	return 0;
 }
 
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index e959c02e0cac..93dcbae1e98d 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -7,6 +7,7 @@
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
 #include <linux/workqueue.h>
+#include <linux/bootmem.h>
 #include <linux/cpuset.h>
 #include <linux/device.h>
 #include <linux/export.h>
@@ -41,15 +42,17 @@ static bool topology_enabled = true;
 static DECLARE_WORK(topology_work, topology_work_fn);
 
 /*
- * Socket/Book linked lists and per_cpu(cpu_topology) updates are
+ * Socket/Book linked lists and cpu_topology updates are
  * protected by "sched_domains_mutex".
  */
 static struct mask_info socket_info;
 static struct mask_info book_info;
 static struct mask_info drawer_info;
 
-DEFINE_PER_CPU(struct cpu_topology_s390, cpu_topology);
-EXPORT_PER_CPU_SYMBOL_GPL(cpu_topology);
+struct cpu_topology_s390 cpu_topology[NR_CPUS];
+EXPORT_SYMBOL_GPL(cpu_topology);
+
+cpumask_t cpus_with_topology;
 
 static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
 {
@@ -97,7 +100,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
 		if (lcpu < 0)
 			continue;
 		for (i = 0; i <= smp_cpu_mtid; i++) {
-			topo = &per_cpu(cpu_topology, lcpu + i);
+			topo = &cpu_topology[lcpu + i];
 			topo->drawer_id = drawer->id;
 			topo->book_id = book->id;
 			topo->socket_id = socket->id;
@@ -106,6 +109,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
 			cpumask_set_cpu(lcpu + i, &drawer->mask);
 			cpumask_set_cpu(lcpu + i, &book->mask);
 			cpumask_set_cpu(lcpu + i, &socket->mask);
+			cpumask_set_cpu(lcpu + i, &cpus_with_topology);
 			smp_cpu_set_polarization(lcpu + i, tl_core->pp);
 		}
 	}
@@ -220,7 +224,7 @@ static void update_cpu_masks(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		topo = &per_cpu(cpu_topology, cpu);
+		topo = &cpu_topology[cpu];
 		topo->thread_mask = cpu_thread_map(cpu);
 		topo->core_mask = cpu_group_map(&socket_info, cpu);
 		topo->book_mask = cpu_group_map(&book_info, cpu);
@@ -231,6 +235,8 @@ static void update_cpu_masks(void)
 			topo->socket_id = cpu;
 			topo->book_id = cpu;
 			topo->drawer_id = cpu;
+			if (cpu_present(cpu))
+				cpumask_set_cpu(cpu, &cpus_with_topology);
 		}
 	}
 	numa_update_cpu_topology();
@@ -241,12 +247,12 @@ void store_topology(struct sysinfo_15_1_x *info)
 	stsi(info, 15, 1, min(topology_max_mnest, 4));
 }
 
-int arch_update_cpu_topology(void)
+static int __arch_update_cpu_topology(void)
 {
 	struct sysinfo_15_1_x *info = tl_info;
-	struct device *dev;
-	int cpu, rc = 0;
+	int rc = 0;
 
+	cpumask_clear(&cpus_with_topology);
 	if (MACHINE_HAS_TOPOLOGY) {
 		rc = 1;
 		store_topology(info);
@@ -255,6 +261,15 @@ int arch_update_cpu_topology(void)
 	update_cpu_masks();
 	if (!MACHINE_HAS_TOPOLOGY)
 		topology_update_polarization_simple();
+	return rc;
+}
+
+int arch_update_cpu_topology(void)
+{
+	struct device *dev;
+	int cpu, rc;
+
+	rc = __arch_update_cpu_topology();
 	for_each_online_cpu(cpu) {
 		dev = get_cpu_device(cpu);
 		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
@@ -394,23 +409,23 @@ int topology_cpu_init(struct cpu *cpu)
 
 static const struct cpumask *cpu_thread_mask(int cpu)
 {
-	return &per_cpu(cpu_topology, cpu).thread_mask;
+	return &cpu_topology[cpu].thread_mask;
 }
 
 
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {
-	return &per_cpu(cpu_topology, cpu).core_mask;
+	return &cpu_topology[cpu].core_mask;
 }
 
 static const struct cpumask *cpu_book_mask(int cpu)
 {
-	return &per_cpu(cpu_topology, cpu).book_mask;
+	return &cpu_topology[cpu].book_mask;
 }
 
 static const struct cpumask *cpu_drawer_mask(int cpu)
 {
-	return &per_cpu(cpu_topology, cpu).drawer_mask;
+	return &cpu_topology[cpu].drawer_mask;
 }
 
 static int __init early_parse_topology(char *p)
@@ -438,19 +453,20 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info,
 		nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i];
 	nr_masks = max(nr_masks, 1);
 	for (i = 0; i < nr_masks; i++) {
-		mask->next = kzalloc(sizeof(*mask->next), GFP_KERNEL);
+		mask->next = memblock_virt_alloc(sizeof(*mask->next), 8);
 		mask = mask->next;
 	}
 }
 
-static int __init s390_topology_init(void)
+void __init topology_init_early(void)
 {
 	struct sysinfo_15_1_x *info;
 	int i;
 
+	set_sched_topology(s390_topology);
 	if (!MACHINE_HAS_TOPOLOGY)
-		return 0;
-	tl_info = (struct sysinfo_15_1_x *)__get_free_page(GFP_KERNEL);
+		goto out;
+	tl_info = memblock_virt_alloc(sizeof(*tl_info), PAGE_SIZE);
 	info = tl_info;
 	store_topology(info);
 	pr_info("The CPU configuration topology of the machine is:");
@@ -460,10 +476,9 @@ static int __init s390_topology_init(void)
 	alloc_masks(info, &socket_info, 1);
 	alloc_masks(info, &book_info, 2);
 	alloc_masks(info, &drawer_info, 3);
-	set_sched_topology(s390_topology);
-	return 0;
+out:
+	__arch_update_cpu_topology();
 }
-early_initcall(s390_topology_init);
 
 static int __init topology_init(void)
 {
diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S
index 5eec9afbb5b5..a5769b83d90e 100644
--- a/arch/s390/kernel/vdso32/clock_gettime.S
+++ b/arch/s390/kernel/vdso32/clock_gettime.S
@@ -99,8 +99,27 @@ __kernel_clock_gettime:
 	tml	%r4,0x0001			/* pending update ? loop */
 	jnz	11b
 	stcke	0(%r15)				/* Store TOD clock */
-	lm	%r0,%r1,1(%r15)
-	s	%r0,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
+	lm	%r0,%r1,__VDSO_TS_END(%r5)	/* TOD steering end time */
+	s	%r0,1(%r15)			/* no - ts_steering_end */
+	sl	%r1,5(%r15)
+	brc	3,22f
+	ahi	%r0,-1
+22:	ltr	%r0,%r0				/* past end of steering? */
+	jm	24f
+	srdl	%r0,15				/* 1 per 2^16 */
+	tm	__VDSO_TS_DIR+3(%r5),0x01	/* steering direction? */
+	jz	23f
+	lcr	%r0,%r0				/* negative TOD offset */
+	lcr	%r1,%r1
+	je	23f
+	ahi	%r0,-1
+23:	a	%r0,1(%r15)			/* add TOD timestamp */
+	al	%r1,5(%r15)
+	brc	12,25f
+	ahi	%r0,1
+	j	25f
+24:	lm	%r0,%r1,1(%r15)			/* load TOD timestamp */
+25:	s	%r0,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
 	sl	%r1,__VDSO_XTIME_STAMP+4(%r5)
 	brc	3,12f
 	ahi	%r0,-1
diff --git a/arch/s390/kernel/vdso32/gettimeofday.S b/arch/s390/kernel/vdso32/gettimeofday.S
index 719de6186b20..63b86dceb0bf 100644
--- a/arch/s390/kernel/vdso32/gettimeofday.S
+++ b/arch/s390/kernel/vdso32/gettimeofday.S
@@ -31,8 +31,27 @@ __kernel_gettimeofday:
 	tml	%r4,0x0001			/* pending update ? loop */
 	jnz	1b
 	stcke	0(%r15)				/* Store TOD clock */
-	lm	%r0,%r1,1(%r15)
-	s	%r0,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
+	lm	%r0,%r1,__VDSO_TS_END(%r5)	/* TOD steering end time */
+	s	%r0,1(%r15)
+	sl	%r1,5(%r15)
+	brc	3,14f
+	ahi	%r0,-1
+14:	ltr	%r0,%r0				/* past end of steering? */
+	jm	16f
+	srdl	%r0,15				/* 1 per 2^16 */
+	tm	__VDSO_TS_DIR+3(%r5),0x01	/* steering direction? */
+	jz	15f
+	lcr	%r0,%r0				/* negative TOD offset */
+	lcr	%r1,%r1
+	je	15f
+	ahi	%r0,-1
+15:	a	%r0,1(%r15)			/* add TOD timestamp */
+	al	%r1,5(%r15)
+	brc	12,17f
+	ahi	%r0,1
+	j	17f
+16:	lm	%r0,%r1,1(%r15)			/* load TOD timestamp */
+17:	s	%r0,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
 	sl	%r1,__VDSO_XTIME_STAMP+4(%r5)
 	brc	3,3f
 	ahi	%r0,-1
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
index 61541fb93dc6..9c3b12626dba 100644
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ b/arch/s390/kernel/vdso64/clock_gettime.S
@@ -83,8 +83,17 @@ __kernel_clock_gettime:
 	tmll	%r4,0x0001			/* pending update ? loop */
 	jnz	5b
 	stcke	0(%r15)				/* Store TOD clock */
-	lgf	%r2,__VDSO_TK_SHIFT(%r5)	/* Timekeeper shift */
 	lg	%r1,1(%r15)
+	lg	%r0,__VDSO_TS_END(%r5)		/* TOD steering end time */
+	slgr	%r0,%r1				/* now - ts_steering_end */
+	ltgr	%r0,%r0				/* past end of steering ? */
+	jm	17f
+	srlg	%r0,%r0,15			/* 1 per 2^16 */
+	tm	__VDSO_TS_DIR+3(%r5),0x01	/* steering direction? */
+	jz	18f
+	lcgr	%r0,%r0				/* negative TOD offset */
+18:	algr	%r1,%r0				/* add steering offset */
+17:	lgf	%r2,__VDSO_TK_SHIFT(%r5)	/* Timekeeper shift */
 	sg	%r1,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
 	msgf	%r1,__VDSO_TK_MULT(%r5)		/*  * tk->mult */
 	alg	%r1,__VDSO_XTIME_NSEC(%r5)	/*  + tk->xtime_nsec */
diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S
index 6ce46707663c..b02e62f3bc12 100644
--- a/arch/s390/kernel/vdso64/gettimeofday.S
+++ b/arch/s390/kernel/vdso64/gettimeofday.S
@@ -31,7 +31,16 @@ __kernel_gettimeofday:
 	jnz	0b
 	stcke	0(%r15)				/* Store TOD clock */
 	lg	%r1,1(%r15)
-	sg	%r1,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
+	lg	%r0,__VDSO_TS_END(%r5)		/* TOD steering end time */
+	slgr	%r0,%r1				/* now - ts_steering_end */
+	ltgr	%r0,%r0				/* past end of steering ? */
+	jm	6f
+	srlg	%r0,%r0,15			/* 1 per 2^16 */
+	tm	__VDSO_TS_DIR+3(%r5),0x01	/* steering direction? */
+	jz	7f
+	lcgr	%r0,%r0				/* negative TOD offset */
+7:	algr	%r1,%r0				/* add steering offset */
+6:	sg	%r1,__VDSO_XTIME_STAMP(%r5)	/* TOD - cycle_last */
 	msgf	%r1,__VDSO_TK_MULT(%r5)		/*  * tk->mult */
 	alg	%r1,__VDSO_XTIME_NSEC(%r5)	/*  + tk->xtime_nsec */
 	lg	%r0,__VDSO_XTIME_SEC(%r5)	/* tk->xtime_sec */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 1bd5dde2d5a9..6b246aadf311 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -96,7 +96,6 @@ static void update_mt_scaling(void)
  */
 static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 {
-	struct thread_info *ti = task_thread_info(tsk);
 	u64 timer, clock, user, system, steal;
 	u64 user_scaled, system_scaled;
 
@@ -119,13 +118,13 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 	    time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies)))
 		update_mt_scaling();
 
-	user = S390_lowcore.user_timer - ti->user_timer;
+	user = S390_lowcore.user_timer - tsk->thread.user_timer;
 	S390_lowcore.steal_timer -= user;
-	ti->user_timer = S390_lowcore.user_timer;
+	tsk->thread.user_timer = S390_lowcore.user_timer;
 
-	system = S390_lowcore.system_timer - ti->system_timer;
+	system = S390_lowcore.system_timer - tsk->thread.system_timer;
 	S390_lowcore.steal_timer -= system;
-	ti->system_timer = S390_lowcore.system_timer;
+	tsk->thread.system_timer = S390_lowcore.system_timer;
 
 	user_scaled = user;
 	system_scaled = system;
@@ -153,15 +152,11 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 
 void vtime_task_switch(struct task_struct *prev)
 {
-	struct thread_info *ti;
-
 	do_account_vtime(prev, 0);
-	ti = task_thread_info(prev);
-	ti->user_timer = S390_lowcore.user_timer;
-	ti->system_timer = S390_lowcore.system_timer;
-	ti = task_thread_info(current);
-	S390_lowcore.user_timer = ti->user_timer;
-	S390_lowcore.system_timer = ti->system_timer;
+	prev->thread.user_timer = S390_lowcore.user_timer;
+	prev->thread.system_timer = S390_lowcore.system_timer;
+	S390_lowcore.user_timer = current->thread.user_timer;
+	S390_lowcore.system_timer = current->thread.system_timer;
 }
 
 /*
@@ -181,7 +176,6 @@ void vtime_account_user(struct task_struct *tsk)
  */
 void vtime_account_irq_enter(struct task_struct *tsk)
 {
-	struct thread_info *ti = task_thread_info(tsk);
 	u64 timer, system, system_scaled;
 
 	timer = S390_lowcore.last_update_timer;
@@ -193,9 +187,9 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 	    time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies)))
 		update_mt_scaling();
 
-	system = S390_lowcore.system_timer - ti->system_timer;
+	system = S390_lowcore.system_timer - tsk->thread.system_timer;
 	S390_lowcore.steal_timer -= system;
-	ti->system_timer = S390_lowcore.system_timer;
+	tsk->thread.system_timer = S390_lowcore.system_timer;
 	system_scaled = system;
 	/* Do MT utilization scaling */
 	if (smp_cpu_mtid) {
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index be4db07f70d3..af13f1a135b6 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -415,7 +415,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
 	int rc;
 
 	mci.val = mchk->mcic;
-	/* take care of lazy register loading via vcpu load/put */
+	/* take care of lazy register loading */
 	save_fpu_regs();
 	save_access_regs(vcpu->run->s.regs.acrs);
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9c7a1ecfe6bd..bec71e902be3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1812,22 +1812,7 @@ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	/* Save host register state */
-	save_fpu_regs();
-	vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
-	vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
-
-	if (MACHINE_HAS_VX)
-		current->thread.fpu.regs = vcpu->run->s.regs.vrs;
-	else
-		current->thread.fpu.regs = vcpu->run->s.regs.fprs;
-	current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
-	if (test_fp_ctl(current->thread.fpu.fpc))
-		/* User space provided an invalid FPC, let's clear it */
-		current->thread.fpu.fpc = 0;
 
-	save_access_regs(vcpu->arch.host_acrs);
-	restore_access_regs(vcpu->run->s.regs.acrs);
 	gmap_enable(vcpu->arch.enabled_gmap);
 	atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
@@ -1844,16 +1829,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->arch.enabled_gmap = gmap_get_enabled();
 	gmap_disable(vcpu->arch.enabled_gmap);
 
-	/* Save guest register state */
-	save_fpu_regs();
-	vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
-
-	/* Restore host register state */
-	current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
-	current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
-
-	save_access_regs(vcpu->run->s.regs.acrs);
-	restore_access_regs(vcpu->arch.host_acrs);
 }
 
 static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
@@ -2243,7 +2218,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
 	memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
-	restore_access_regs(vcpu->run->s.regs.acrs);
 	return 0;
 }
 
@@ -2257,11 +2231,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	/* make sure the new values will be lazily loaded */
-	save_fpu_regs();
 	if (test_fp_ctl(fpu->fpc))
 		return -EINVAL;
-	current->thread.fpu.fpc = fpu->fpc;
+	vcpu->run->s.regs.fpc = fpu->fpc;
 	if (MACHINE_HAS_VX)
 		convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
 				 (freg_t *) fpu->fprs);
@@ -2279,7 +2251,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 				 (__vector128 *) vcpu->run->s.regs.vrs);
 	else
 		memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs));
-	fpu->fpc = current->thread.fpu.fpc;
+	fpu->fpc = vcpu->run->s.regs.fpc;
 	return 0;
 }
 
@@ -2740,6 +2712,20 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		if (riccb->valid)
 			vcpu->arch.sie_block->ecb3 |= 0x01;
 	}
+	save_access_regs(vcpu->arch.host_acrs);
+	restore_access_regs(vcpu->run->s.regs.acrs);
+	/* save host (userspace) fprs/vrs */
+	save_fpu_regs();
+	vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
+	vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
+	if (MACHINE_HAS_VX)
+		current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+	else
+		current->thread.fpu.regs = vcpu->run->s.regs.fprs;
+	current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
+	if (test_fp_ctl(current->thread.fpu.fpc))
+		/* User space provided an invalid FPC, let's clear it */
+		current->thread.fpu.fpc = 0;
 
 	kvm_run->kvm_dirty_regs = 0;
 }
@@ -2758,6 +2744,15 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_run->s.regs.pft = vcpu->arch.pfault_token;
 	kvm_run->s.regs.pfs = vcpu->arch.pfault_select;
 	kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
+	save_access_regs(vcpu->run->s.regs.acrs);
+	restore_access_regs(vcpu->arch.host_acrs);
+	/* Save guest register state */
+	save_fpu_regs();
+	vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
+	/* Restore will be done lazily at return */
+	current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
+	current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
+
 }
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -2874,7 +2869,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 {
 	/*
 	 * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
-	 * copying in vcpu load/put. Lets update our copies before we save
+	 * switch in the run ioctl. Let's update our copies before we save
 	 * it into the save area
 	 */
 	save_fpu_regs();
diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S
index be9fa65bfac4..7422a706f310 100644
--- a/arch/s390/lib/mem.S
+++ b/arch/s390/lib/mem.S
@@ -8,6 +8,45 @@
 #include <asm/export.h>
 
 /*
+ * void *memmove(void *dest, const void *src, size_t n)
+ */
+ENTRY(memmove)
+	ltgr	%r4,%r4
+	lgr	%r1,%r2
+	bzr	%r14
+	clgr	%r2,%r3
+	jnh	.Lmemmove_forward
+	la	%r5,0(%r4,%r3)
+	clgr	%r2,%r5
+	jl	.Lmemmove_reverse
+.Lmemmove_forward:
+	aghi	%r4,-1
+	srlg	%r0,%r4,8
+	ltgr	%r0,%r0
+	jz	.Lmemmove_rest
+.Lmemmove_loop:
+	mvc	0(256,%r1),0(%r3)
+	la	%r1,256(%r1)
+	la	%r3,256(%r3)
+	brctg	%r0,.Lmemmove_loop
+.Lmemmove_rest:
+	larl	%r5,.Lmemmove_mvc
+	ex	%r4,0(%r5)
+	br	%r14
+.Lmemmove_reverse:
+	aghi	%r4,-1
+.Lmemmove_reverse_loop:
+	ic	%r0,0(%r4,%r3)
+	stc	%r0,0(%r4,%r1)
+	brctg	%r4,.Lmemmove_reverse_loop
+	ic	%r0,0(%r4,%r3)
+	stc	%r0,0(%r4,%r1)
+	br	%r14
+.Lmemmove_mvc:
+	mvc	0(1,%r1),0(%r3)
+EXPORT_SYMBOL(memmove)
+
+/*
  * memset implementation
  *
  * This code corresponds to the C construct below. We do distinguish
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 661d9fe63c43..d1faae5cdd12 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -733,6 +733,7 @@ block:
 			 * return to userspace schedule() to block. */
 			__set_current_state(TASK_UNINTERRUPTIBLE);
 			set_tsk_need_resched(tsk);
+			set_preempt_need_resched();
 		}
 	}
 out:
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 1848292766ef..45becc8a44ec 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -34,7 +34,7 @@ static void __ref *vmem_alloc_pages(unsigned int order)
 
 	if (slab_is_available())
 		return (void *)__get_free_pages(GFP_KERNEL, order);
-	return alloc_bootmem_align(size, size);
+	return (void *) memblock_alloc(size, size);
 }
 
 static inline pud_t *vmem_pud_alloc(void)
@@ -61,17 +61,16 @@ pmd_t *vmem_pmd_alloc(void)
 
 pte_t __ref *vmem_pte_alloc(void)
 {
+	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
 	pte_t *pte;
 
 	if (slab_is_available())
 		pte = (pte_t *) page_table_alloc(&init_mm);
 	else
-		pte = alloc_bootmem_align(PTRS_PER_PTE * sizeof(pte_t),
-					  PTRS_PER_PTE * sizeof(pte_t));
+		pte = (pte_t *) memblock_alloc(size, size);
 	if (!pte)
 		return NULL;
-	clear_table((unsigned long *) pte, _PAGE_INVALID,
-		    PTRS_PER_PTE * sizeof(pte_t));
+	clear_table((unsigned long *) pte, _PAGE_INVALID, size);
 	return pte;
 }
 
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
index 37e0bb835516..cfd08384f0ab 100644
--- a/arch/s390/numa/mode_emu.c
+++ b/arch/s390/numa/mode_emu.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/cpumask.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>
 #include <linux/node.h>
 #include <linux/memory.h>
 #include <linux/slab.h>
@@ -307,13 +308,11 @@ fail:
 /*
  * Allocate and initialize core to node mapping
  */
-static void create_core_to_node_map(void)
+static void __ref create_core_to_node_map(void)
 {
 	int i;
 
-	emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL);
-	if (emu_cores == NULL)
-		panic("Could not allocate cores to node memory");
+	emu_cores = memblock_virt_alloc(sizeof(*emu_cores), 8);
 	for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
 		emu_cores->to_node_id[i] = NODE_ID_FREE;
 }
@@ -354,13 +353,13 @@ static struct toptree *toptree_from_topology(void)
 
 	phys = toptree_new(TOPTREE_ID_PHYS, 1);
 
-	for_each_online_cpu(cpu) {
-		top = &per_cpu(cpu_topology, cpu);
+	for_each_cpu(cpu, &cpus_with_topology) {
+		top = &cpu_topology[cpu];
 		node = toptree_get_child(phys, 0);
 		drawer = toptree_get_child(node, top->drawer_id);
 		book = toptree_get_child(drawer, top->book_id);
 		mc = toptree_get_child(book, top->socket_id);
-		core = toptree_get_child(mc, top->core_id);
+		core = toptree_get_child(mc, smp_get_base_cpu(cpu));
 		if (!drawer || !book || !mc || !core)
 			panic("NUMA emulation could not allocate memory");
 		cpumask_set_cpu(cpu, &core->mask);
@@ -378,7 +377,7 @@ static void topology_add_core(struct toptree *core)
 	int cpu;
 
 	for_each_cpu(cpu, &core->mask) {
-		top = &per_cpu(cpu_topology, cpu);
+		top = &cpu_topology[cpu];
 		cpumask_copy(&top->thread_mask, &core->mask);
 		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
 		cpumask_copy(&top->book_mask, &core_book(core)->mask);
@@ -425,6 +424,27 @@ static void print_node_to_core_map(void)
 	}
 }
 
+static void pin_all_possible_cpus(void)
+{
+	int core_id, node_id, cpu;
+	static int initialized;
+
+	if (initialized)
+		return;
+	print_node_to_core_map();
+	node_id = 0;
+	for_each_possible_cpu(cpu) {
+		core_id = smp_get_base_cpu(cpu);
+		if (emu_cores->to_node_id[core_id] != NODE_ID_FREE)
+			continue;
+		pin_core_to_node(core_id, node_id);
+		cpu_topology[cpu].node_id = node_id;
+		node_id = (node_id + 1) % emu_nodes;
+	}
+	print_node_to_core_map();
+	initialized = 1;
+}
+
 /*
  * Transfer physical topology into a NUMA topology and modify CPU masks
  * according to the NUMA topology.
@@ -442,7 +462,7 @@ static void emu_update_cpu_topology(void)
 	toptree_free(phys);
 	toptree_to_topology(numa);
 	toptree_free(numa);
-	print_node_to_core_map();
+	pin_all_possible_cpus();
 }
 
 /*
diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c
index 902d350d859a..26f622b1cd11 100644
--- a/arch/s390/numa/toptree.c
+++ b/arch/s390/numa/toptree.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/bootmem.h>
 #include <linux/cpumask.h>
 #include <linux/list.h>
 #include <linux/list_sort.h>
@@ -25,10 +26,14 @@
  * RETURNS:
  * Pointer to the new tree node or NULL on error
  */
-struct toptree *toptree_alloc(int level, int id)
+struct toptree __ref *toptree_alloc(int level, int id)
 {
-	struct toptree *res = kzalloc(sizeof(struct toptree), GFP_KERNEL);
+	struct toptree *res;
 
+	if (slab_is_available())
+		res = kzalloc(sizeof(*res), GFP_KERNEL);
+	else
+		res = memblock_virt_alloc(sizeof(*res), 8);
 	if (!res)
 		return res;
 
@@ -65,7 +70,7 @@ static void toptree_remove(struct toptree *cand)
  * cleanly using toptree_remove. Possible children are freed
  * recursively. In the end @cand itself is freed.
  */
-void toptree_free(struct toptree *cand)
+void __ref toptree_free(struct toptree *cand)
 {
 	struct toptree *child, *tmp;
 
@@ -73,7 +78,10 @@ void toptree_free(struct toptree *cand)
 		toptree_remove(cand);
 	toptree_for_each_child_safe(child, tmp, cand)
 		toptree_free(child);
-	kfree(cand);
+	if (slab_is_available())
+		kfree(cand);
+	else
+		memblock_free_early((unsigned long)cand, sizeof(*cand));
 }
 
 /**
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 15ffc19c8c0c..64e1734bebb7 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -722,6 +722,11 @@ struct dev_pm_ops pcibios_pm_ops = {
 
 static int zpci_alloc_domain(struct zpci_dev *zdev)
 {
+	if (zpci_unique_uid) {
+		zdev->domain = (u16) zdev->uid;
+		return 0;
+	}
+
 	spin_lock(&zpci_domain_lock);
 	zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES);
 	if (zdev->domain == ZPCI_NR_DEVICES) {
@@ -735,6 +740,9 @@ static int zpci_alloc_domain(struct zpci_dev *zdev)
 
 static void zpci_free_domain(struct zpci_dev *zdev)
 {
+	if (zpci_unique_uid)
+		return;
+
 	spin_lock(&zpci_domain_lock);
 	clear_bit(zdev->domain, zpci_domain);
 	spin_unlock(&zpci_domain_lock);
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 1a4512c8544a..e3ef63b36b5a 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -22,6 +22,8 @@
 #include <asm/clp.h>
 #include <uapi/asm/clp.h>
 
+bool zpci_unique_uid;
+
 static inline void zpci_err_clp(unsigned int rsp, int rc)
 {
 	struct {
@@ -315,6 +317,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb,
 			goto out;
 		}
 
+		zpci_unique_uid = rrb->response.uid_checking;
 		WARN_ON_ONCE(rrb->response.entry_size !=
 			sizeof(struct clp_fh_list_entry));
 
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 38993b156924..c2f786f0ea06 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -69,7 +69,7 @@ static void pci_sw_counter_show(struct seq_file *m)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++)
-		seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i],
+		seq_printf(m, "%26s:\t%lu\n", pci_sw_names[i],
 			   atomic64_read(counter));
 }
 
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 6b2f72f523b9..1d7a9c71944a 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -181,14 +181,17 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
 	/*
 	 * With zdev->tlb_refresh == 0, rpcit is not required to establish new
 	 * translations when previously invalid translation-table entries are
-	 * validated. With lazy unmap, it also is skipped for previously valid
+	 * validated. With lazy unmap, rpcit is skipped for previously valid
 	 * entries, but a global rpcit is then required before any address can
 	 * be re-used, i.e. after each iommu bitmap wrap-around.
 	 */
-	if (!zdev->tlb_refresh &&
-			(!s390_iommu_strict ||
-			((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)))
-		return 0;
+	if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) {
+		if (!zdev->tlb_refresh)
+			return 0;
+	} else {
+		if (!s390_iommu_strict)
+			return 0;
+	}
 
 	return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
 				  PAGE_ALIGN(size));
@@ -257,7 +260,7 @@ static dma_addr_t dma_alloc_address(struct device *dev, int size)
 	spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
 	offset = __dma_alloc_iommu(dev, zdev->next_bit, size);
 	if (offset == -1) {
-		if (!zdev->tlb_refresh && !s390_iommu_strict) {
+		if (!s390_iommu_strict) {
 			/* global flush before DMA addresses are reused */
 			if (zpci_refresh_global(zdev))
 				goto out_error;
@@ -292,7 +295,7 @@ static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size)
 	if (!zdev->iommu_bitmap)
 		goto out;
 
-	if (zdev->tlb_refresh || s390_iommu_strict)
+	if (s390_iommu_strict)
 		bitmap_clear(zdev->iommu_bitmap, offset, size);
 	else
 		bitmap_set(zdev->lazy_bitmap, offset, size);
@@ -388,8 +391,6 @@ static void *s390_dma_alloc(struct device *dev, size_t size,
 		return NULL;
 
 	pa = page_to_phys(page);
-	memset((void *) pa, 0, size);
-
 	map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0);
 	if (dma_mapping_error(dev, map)) {
 		free_pages(pa, get_order(size));
@@ -419,6 +420,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
 			     size_t size, dma_addr_t *handle,
 			     enum dma_data_direction dir)
 {
+	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
 	dma_addr_t dma_addr_base, dma_addr;
 	int flags = ZPCI_PTE_VALID;
@@ -426,8 +428,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	unsigned long pa = 0;
 	int ret;
 
-	size = PAGE_ALIGN(size);
-	dma_addr_base = dma_alloc_address(dev, size >> PAGE_SHIFT);
+	dma_addr_base = dma_alloc_address(dev, nr_pages);
 	if (dma_addr_base == DMA_ERROR_CODE)
 		return -ENOMEM;
 
@@ -436,26 +437,27 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		flags |= ZPCI_TABLE_PROTECTED;
 
 	for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) {
-		pa = page_to_phys(sg_page(s)) + s->offset;
-		ret = __dma_update_trans(zdev, pa, dma_addr, s->length, flags);
+		pa = page_to_phys(sg_page(s));
+		ret = __dma_update_trans(zdev, pa, dma_addr,
+					 s->offset + s->length, flags);
 		if (ret)
 			goto unmap;
 
-		dma_addr += s->length;
+		dma_addr += s->offset + s->length;
 	}
 	ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags);
 	if (ret)
 		goto unmap;
 
 	*handle = dma_addr_base;
-	atomic64_add(size >> PAGE_SHIFT, &zdev->mapped_pages);
+	atomic64_add(nr_pages, &zdev->mapped_pages);
 
 	return ret;
 
 unmap:
 	dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base,
 			 ZPCI_PTE_INVALID);
-	dma_free_address(dev, dma_addr_base, size >> PAGE_SHIFT);
+	dma_free_address(dev, dma_addr_base, nr_pages);
 	zpci_err("map error:\n");
 	zpci_err_dma(ret, pa);
 	return ret;
@@ -564,7 +566,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
 		rc = -ENOMEM;
 		goto free_dma_table;
 	}
-	if (!zdev->tlb_refresh && !s390_iommu_strict) {
+	if (!s390_iommu_strict) {
 		zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8);
 		if (!zdev->lazy_bitmap) {
 			rc = -ENOMEM;
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index 6d9814c9df2b..4b5e1e499527 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -9,7 +9,5 @@ define filechk_facilities.h
 	$(obj)/gen_facilities
 endef
 
-$(obj)/gen_facilities.o: $(srctree)/arch/s390/tools/gen_facilities.c
-
 include/generated/facilities.h: $(obj)/gen_facilities FORCE
 	$(call filechk,facilities.h)
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index fe4e6c910dd7..8cc53b1e6d03 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -7,13 +7,83 @@
  *
  */
 
-#define S390_GEN_FACILITIES_C
-
 #include <strings.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <asm/facilities_src.h>
+
+struct facility_def {
+	char *name;
+	int *bits;
+};
+
+static struct facility_def facility_defs[] = {
+	{
+		/*
+		 * FACILITIES_ALS contains the list of facilities that are
+		 * required to run a kernel that is compiled e.g. with
+		 * -march=<machine>.
+		 */
+		.name = "FACILITIES_ALS",
+		.bits = (int[]){
+#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES
+			0,  /* N3 instructions */
+			1,  /* z/Arch mode installed */
+#endif
+#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES
+			18, /* long displacement facility */
+#endif
+#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
+			7,  /* stfle */
+			17, /* message security assist */
+			21, /* extended-immediate facility */
+			25, /* store clock fast */
+#endif
+#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
+			27, /* mvcos */
+			32, /* compare and swap and store */
+			33, /* compare and swap and store 2 */
+			34, /* general extension facility */
+			35, /* execute extensions */
+#endif
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+			45, /* fast-BCR, etc. */
+#endif
+#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
+			49, /* misc-instruction-extensions */
+			52, /* interlocked facility 2 */
+#endif
+#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES
+			53, /* load-and-zero-rightmost-byte, etc. */
+#endif
+			-1 /* END */
+		}
+	},
+	{
+		.name = "FACILITIES_KVM",
+		.bits = (int[]){
+			0,  /* N3 instructions */
+			1,  /* z/Arch mode installed */
+			2,  /* z/Arch mode active */
+			3,  /* DAT-enhancement */
+			4,  /* idte segment table */
+			5,  /* idte region table */
+			6,  /* ASN-and-LX reuse */
+			7,  /* stfle */
+			8,  /* enhanced-DAT 1 */
+			9,  /* sense-running-status */
+			10, /* conditional sske */
+			13, /* ipte-range */
+			14, /* nonquiescing key-setting */
+			73, /* transactional execution */
+			75, /* access-exception-fetch/store indication */
+			76, /* msa extension 3 */
+			77, /* msa extension 4 */
+			78, /* enhanced-DAT 2 */
+			-1  /* END */
+		}
+	},
+};
 
 static void print_facility_list(struct facility_def *def)
 {
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index accc7ca722e1..252e9fee687f 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
 #
-# Makefile for the Linux/SuperH CPU-specifc backends.
+# Makefile for the Linux/SuperH CPU-specific backends.
 #
 
 obj-$(CONFIG_CPU_SH2)		= sh2/
diff --git a/arch/sh/kernel/cpu/irq/Makefile b/arch/sh/kernel/cpu/irq/Makefile
index f0c7025a67d1..3f8e79402d7d 100644
--- a/arch/sh/kernel/cpu/irq/Makefile
+++ b/arch/sh/kernel/cpu/irq/Makefile
@@ -1,5 +1,5 @@
 #
-# Makefile for the Linux/SuperH CPU-specifc IRQ handlers.
+# Makefile for the Linux/SuperH CPU-specific IRQ handlers.
 #
 obj-$(CONFIG_SUPERH32)			+= imask.o
 obj-$(CONFIG_CPU_SH5)			+= intc-sh5.o
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 34d9e15857c3..44163e8c3868 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -25,7 +25,7 @@ KCOV_INSTRUMENT		:= n
 targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
 	vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
 
-KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
+KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2
 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC)
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 cflags-$(CONFIG_X86_32) := -march=i386
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 476b574de99e..ec23d8e1297c 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -1,13 +1,17 @@
 #ifndef _ASM_X86_E820_H
 #define _ASM_X86_E820_H
 
-#ifdef CONFIG_EFI
+/*
+ * E820_X_MAX is the maximum size of the extended E820 table.  The extended
+ * table may contain up to 3 extra E820 entries per possible NUMA node, so we
+ * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128.
+ * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h.
+ */
 #include <linux/numa.h>
 #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
-#else	/* ! CONFIG_EFI */
-#define E820_X_MAX E820MAX
-#endif
+
 #include <uapi/asm/e820.h>
+
 #ifndef __ASSEMBLY__
 /* see comment in arch/x86/kernel/e820.c */
 extern struct e820map *e820;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde80731f49..7892530cbacf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,6 +191,8 @@ enum {
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33
 
 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
 #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
@@ -198,6 +200,13 @@ enum {
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+
+#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK |	\
+				 PFERR_USER_MASK |		\
+				 PFERR_WRITE_MASK |		\
+				 PFERR_PRESENT_MASK)
 
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
@@ -1062,6 +1071,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+bool pdptrs_changed(struct kvm_vcpu *vcpu);
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
@@ -1124,7 +1134,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 struct x86_emulate_ctxt;
 
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port);
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
@@ -1203,7 +1214,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
@@ -1358,7 +1369,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
 extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
 
 int kvm_is_in_guest(void);
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a002b07a7099..2b5b2d4b924e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,7 @@
 #define VMX_H
 
 
+#include <linux/bitops.h>
 #include <linux/types.h>
 #include <uapi/asm/vmx.h>
 
@@ -60,6 +61,7 @@
  */
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
+#define SECONDARY_EXEC_DESC			0x00000004
 #define SECONDARY_EXEC_RDTSCP			0x00000008
 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
@@ -110,6 +112,36 @@
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
+{
+	return vmx_basic & GENMASK_ULL(30, 0);
+}
+
+static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
+{
+	return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
+}
+
+static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
+{
+	return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+}
+
+static inline int vmx_misc_cr3_count(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
+}
+
+static inline int vmx_misc_max_msr(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
+}
+
+static inline int vmx_misc_mseg_revid(u64 vmx_misc)
+{
+	return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
+}
+
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
@@ -399,10 +431,11 @@ enum vmcs_field {
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	(KVM_USER_MEM_SLOTS + 2)
 
 #define VMX_NR_VPIDS				(1 << 16)
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT		1
 #define VMX_VPID_EXTENT_ALL_CONTEXT		2
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL	3
 
-#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
 #define VMX_EPT_EXTENT_SHIFT			24
@@ -419,8 +452,10 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
 #define VMX_VPID_INVVPID_BIT                    (1ull << 0) /* (32 - 32) */
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT     (1ull << 8) /* (40 - 32) */
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
 #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT   (1ull << 11) /* (43 - 32) */
 
 #define VMX_EPT_DEFAULT_GAW			3
 #define VMX_EPT_MAX_GAW				0x4
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee272618f..14458658e988 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,8 @@
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
 #define EXIT_REASON_EOI_INDUCED         45
+#define EXIT_REASON_GDTR_IDTR           46
+#define EXIT_REASON_LDTR_TR             47
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_INVEPT              50
@@ -113,6 +115,8 @@
 	{ EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
 	{ EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
+	{ EXIT_REASON_GDTR_IDTR,	     "GDTR_IDTR" }, \
+	{ EXIT_REASON_LDTR_TR,		     "LDTR_TR" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
 	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
@@ -129,6 +133,7 @@
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
+#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index de6626c18e42..be6337156502 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu)
 		ci_leaf_init(this_leaf++, &id4_regs);
 		__cache_cpumap_setup(cpu, idx, &id4_regs);
 	}
+	this_cpu_ci->cpu_map_populated = true;
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 132e1ec67da0..00ef43233e03 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -516,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c)
 
 static void mce_schedule_work(void)
 {
-	if (!mce_gen_pool_empty() && keventd_up())
+	if (!mce_gen_pool_empty())
 		schedule_work(&mce_work);
 }
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0aefb626fa8f..b2d3cf1ef54a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,6 +16,7 @@
 #include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <asm/processor.h>
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
 #include "cpuid.h"
@@ -64,6 +65,11 @@ u64 kvm_supported_xcr0(void)
 
 #define F(x) bit(X86_FEATURE_##x)
 
+/* These are scattered features in cpufeatures.h. */
+#define KVM_CPUID_BIT_AVX512_4VNNIW     2
+#define KVM_CPUID_BIT_AVX512_4FMAPS     3
+#define KF(x) bit(KVM_CPUID_BIT_##x)
+
 int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -80,6 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 			best->ecx |= F(OSXSAVE);
 	}
 
+	best->edx &= ~F(APIC);
+	if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
+		best->edx |= F(APIC);
+
 	if (apic) {
 		if (best->ecx & F(TSC_DEADLINE_TIMER))
 			apic->lapic_timer.timer_mode_mask = 3 << 17;
@@ -374,6 +384,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	/* cpuid 7.0.ecx*/
 	const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
 
+	/* cpuid 7.0.edx*/
+	const u32 kvm_cpuid_7_0_edx_x86_features =
+		KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
+
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
 
@@ -456,12 +470,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			/* PKU is not yet implemented for shadow paging. */
 			if (!tdp_enabled)
 				entry->ecx &= ~F(PKU);
+			entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+			entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
 		} else {
 			entry->ebx = 0;
 			entry->ecx = 0;
+			entry->edx = 0;
 		}
 		entry->eax = 0;
-		entry->edx = 0;
 		break;
 	}
 	case 9:
@@ -861,17 +877,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 }
 EXPORT_SYMBOL_GPL(kvm_cpuid);
 
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
-	u32 function, eax, ebx, ecx, edx;
+	u32 eax, ebx, ecx, edx;
 
-	function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
 	kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
 	kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
 	kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	return kvm_skip_emulated_instruction(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3ce9d260d68..56628a44668b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -158,9 +158,11 @@
 #define Src2GS      (OpGS << Src2Shift)
 #define Src2Mask    (OpMask << Src2Shift)
 #define Mmx         ((u64)1 << 40)  /* MMX Vector instruction */
+#define AlignMask   ((u64)7 << 41)
 #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
-#define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
-#define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
+#define Unaligned   ((u64)2 << 41)  /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx         ((u64)3 << 41)  /* Advanced Vector Extensions */
+#define Aligned16   ((u64)4 << 41)  /* Aligned to 16 byte boundary (e.g. FXSAVE) */
 #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
 #define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
@@ -446,6 +448,26 @@ FOP_END;
 FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
 FOP_END;
 
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ *      Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+	int _fault = 0; \
+ \
+	asm volatile("1:" insn "\n" \
+	             "2:\n" \
+	             ".pushsection .fixup, \"ax\"\n" \
+	             "3: movl $1, %[_fault]\n" \
+	             "   jmp  2b\n" \
+	             ".popsection\n" \
+	             _ASM_EXTABLE(1b, 3b) \
+	             : [_fault] "+qm"(_fault) inoutclob ); \
+ \
+	_fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
+
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 				    enum x86_intercept intercept,
 				    enum x86_intercept_stage stage)
@@ -632,21 +654,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
  * depending on whether they're AVX encoded or not.
  *
  * Also included is CMPXCHG16B which is not a vector instruction, yet it is
- * subject to the same check.
+ * subject to the same check.  FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
  */
-static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
 {
-	if (likely(size < 16))
-		return false;
+	u64 alignment = ctxt->d & AlignMask;
 
-	if (ctxt->d & Aligned)
-		return true;
-	else if (ctxt->d & Unaligned)
-		return false;
-	else if (ctxt->d & Avx)
-		return false;
-	else
-		return true;
+	if (likely(size < 16))
+		return 1;
+
+	switch (alignment) {
+	case Unaligned:
+	case Avx:
+		return 1;
+	case Aligned16:
+		return 16;
+	case Aligned:
+	default:
+		return size;
+	}
 }
 
 static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
@@ -704,7 +731,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		break;
 	}
-	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+	if (la & (insn_alignment(ctxt, size) - 1))
 		return emulate_gp(ctxt, 0);
 	return X86EMUL_CONTINUE;
 bad:
@@ -3842,6 +3869,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax = 1, ebx, ecx = 0, edx;
+
+	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+	if (!(edx & FFL(FXSR)))
+		return emulate_ud(ctxt);
+
+	if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+		return emulate_nm(ctxt);
+
+	/*
+	 * Don't emulate a case that should never be hit, instead of working
+	 * around a lack of fxsave64/fxrstor64 on old compilers.
+	 */
+	if (ctxt->mode >= X86EMUL_MODE_PROT64)
+		return X86EMUL_UNHANDLEABLE;
+
+	return X86EMUL_CONTINUE;
+}
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ *  1) 16 bit mode
+ *  2) 32 bit mode
+ *     - like (1), but FIP and FDP (foo) are only 16 bit.  At least Intel CPUs
+ *       preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
+ *       save and restore
+ *  3) 64-bit mode with REX.W prefix
+ *     - like (2), but XMM 8-15 are being saved and restored
+ *  4) 64-bit mode without REX.W prefix
+ *     - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result.  (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+	struct fxregs_state fx_state;
+	size_t size;
+	int rc;
+
+	rc = check_fxsr(ctxt);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	ctxt->ops->get_fpu(ctxt);
+
+	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+	ctxt->ops->put_fpu(ctxt);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
+		size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
+	else
+		size = offsetof(struct fxregs_state, xmm_space[0]);
+
+	return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+}
+
+static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
+		struct fxregs_state *new)
+{
+	int rc = X86EMUL_CONTINUE;
+	struct fxregs_state old;
+
+	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	/*
+	 * 64 bit host will restore XMM 8-15, which is not correct on non-64
+	 * bit guests.  Load the current values in order to preserve 64 bit
+	 * XMMs after fxrstor.
+	 */
+#ifdef CONFIG_X86_64
+	/* XXX: accessing XMM 8-15 very awkwardly */
+	memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
+#endif
+
+	/*
+	 * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
+	 * does save and restore MXCSR.
+	 */
+	if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
+		memcpy(new->xmm_space, old.xmm_space, 8 * 16);
+
+	return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+	struct fxregs_state fx_state;
+	int rc;
+
+	rc = check_fxsr(ctxt);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	if (fx_state.mxcsr >> 16)
+		return emulate_gp(ctxt, 0);
+
+	ctxt->ops->get_fpu(ctxt);
+
+	if (ctxt->mode < X86EMUL_MODE_PROT64)
+		rc = fxrstor_fixup(ctxt, &fx_state);
+
+	if (rc == X86EMUL_CONTINUE)
+		rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+	ctxt->ops->put_fpu(ctxt);
+
+	return rc;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -4194,7 +4346,9 @@ static const struct gprefix pfx_0f_ae_7 = {
 };
 
 static const struct group_dual group15 = { {
-	N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+	I(ModRM | Aligned16, em_fxsave),
+	I(ModRM | Aligned16, em_fxrstor),
+	N, N, N, N, N, GP(0, &pfx_0f_ae_7),
 }, {
 	N, N, N, N, N, N, N, N,
 } };
@@ -5066,21 +5220,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 
 static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
 {
-	bool fault = false;
+	int rc;
 
 	ctxt->ops->get_fpu(ctxt);
-	asm volatile("1: fwait \n\t"
-		     "2: \n\t"
-		     ".pushsection .fixup,\"ax\" \n\t"
-		     "3: \n\t"
-		     "movb $1, %[fault] \n\t"
-		     "jmp 2b \n\t"
-		     ".popsection \n\t"
-		     _ASM_EXTABLE(1b, 3b)
-		     : [fault]"+qm"(fault));
+	rc = asm_safe("fwait");
 	ctxt->ops->put_fpu(ctxt);
 
-	if (unlikely(fault))
+	if (unlikely(rc != X86EMUL_CONTINUE))
 		return emulate_exception(ctxt, MF_VECTOR, 0, false);
 
 	return X86EMUL_CONTINUE;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 42b1c83741c8..99cde5220e07 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -291,7 +291,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
 	return ret;
 }
 
-int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
 {
 	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
 	struct kvm_lapic_irq irq;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 16a7134eedac..a78b445ce411 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 	 */
 	smp_mb();
 	if (atomic_dec_if_positive(&ps->pending) > 0)
-		kthread_queue_work(&pit->worker, &pit->expired);
+		kthread_queue_work(pit->worker, &pit->expired);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 	if (atomic_read(&ps->reinject))
 		atomic_inc(&ps->pending);
 
-	kthread_queue_work(&pt->worker, &pt->expired);
+	kthread_queue_work(pt->worker, &pt->expired);
 
 	if (ps->is_periodic) {
 		hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	pid_nr = pid_vnr(pid);
 	put_pid(pid);
 
-	kthread_init_worker(&pit->worker);
-	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
-				       "kvm-pit/%d", pid_nr);
-	if (IS_ERR(pit->worker_task))
+	pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+	if (IS_ERR(pit->worker))
 		goto fail_kthread;
 
 	kthread_init_work(&pit->expired, pit_do_work);
@@ -713,7 +711,7 @@ fail_register_speaker:
 fail_register_pit:
 	mutex_unlock(&kvm->slots_lock);
 	kvm_pit_set_reinject(pit, false);
-	kthread_stop(pit->worker_task);
+	kthread_destroy_worker(pit->worker);
 fail_kthread:
 	kvm_free_irq_source_id(kvm, pit->irq_source_id);
 fail_request:
@@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm)
 		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
 		kvm_pit_set_reinject(pit, false);
 		hrtimer_cancel(&pit->pit_state.timer);
-		kthread_flush_work(&pit->expired);
-		kthread_stop(pit->worker_task);
+		kthread_destroy_worker(pit->worker);
 		kvm_free_irq_source_id(kvm, pit->irq_source_id);
 		kfree(pit);
 	}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 2f5af0798326..600bee9dcbbd 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -44,8 +44,7 @@ struct kvm_pit {
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
-	struct kthread_worker worker;
-	struct task_struct *worker_task;
+	struct kthread_worker *worker;
 	struct kthread_work expired;
 };
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6f69340f9fa3..34a66b2d47e6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -342,9 +342,11 @@ void __kvm_apic_update_irr(u32 *pir, void *regs)
 	u32 i, pir_val;
 
 	for (i = 0; i <= 7; i++) {
-		pir_val = xchg(&pir[i], 0);
-		if (pir_val)
+		pir_val = READ_ONCE(pir[i]);
+		if (pir_val) {
+			pir_val = xchg(&pir[i], 0);
 			*((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
+		}
 	}
 }
 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
@@ -1090,7 +1092,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
 {
-	ktime_t remaining;
+	ktime_t remaining, now;
 	s64 ns;
 	u32 tmcct;
 
@@ -1101,7 +1103,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 		apic->lapic_timer.period == 0)
 		return 0;
 
-	remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
+	now = ktime_get();
+	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
 	if (ktime_to_ns(remaining) < 0)
 		remaining = ktime_set(0, 0);
 
@@ -1332,7 +1335,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 
 	local_irq_save(flags);
 
-	now = apic->lapic_timer.timer.base->get_time();
+	now = ktime_get();
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
 	if (likely(tscdeadline > guest_tsc)) {
 		ns = (tscdeadline - guest_tsc) * 1000000ULL;
@@ -1347,6 +1350,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
+static void start_sw_period(struct kvm_lapic *apic)
+{
+	if (!apic->lapic_timer.period)
+		return;
+
+	if (apic_lvtt_oneshot(apic) &&
+	    ktime_after(ktime_get(),
+			apic->lapic_timer.target_expiration)) {
+		apic_timer_expired(apic);
+		return;
+	}
+
+	hrtimer_start(&apic->lapic_timer.timer,
+		apic->lapic_timer.target_expiration,
+		HRTIMER_MODE_ABS_PINNED);
+}
+
+static bool set_target_expiration(struct kvm_lapic *apic)
+{
+	ktime_t now;
+	u64 tscl = rdtsc();
+
+	now = ktime_get();
+	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+		* APIC_BUS_CYCLE_NS * apic->divide_count;
+
+	if (!apic->lapic_timer.period)
+		return false;
+
+	/*
+	 * Do not allow the guest to program periodic timers with small
+	 * interval, since the hrtimers are not throttled by the host
+	 * scheduler.
+	 */
+	if (apic_lvtt_period(apic)) {
+		s64 min_period = min_timer_period_us * 1000LL;
+
+		if (apic->lapic_timer.period < min_period) {
+			pr_info_ratelimited(
+			    "kvm: vcpu %i: requested %lld ns "
+			    "lapic timer period limited to %lld ns\n",
+			    apic->vcpu->vcpu_id,
+			    apic->lapic_timer.period, min_period);
+			apic->lapic_timer.period = min_period;
+		}
+	}
+
+	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+		   PRIx64 ", "
+		   "timer initial count 0x%x, period %lldns, "
+		   "expire @ 0x%016" PRIx64 ".\n", __func__,
+		   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+		   kvm_lapic_get_reg(apic, APIC_TMICT),
+		   apic->lapic_timer.period,
+		   ktime_to_ns(ktime_add_ns(now,
+				apic->lapic_timer.period)));
+
+	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+	apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+
+	return true;
+}
+
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
+{
+	apic->lapic_timer.tscdeadline +=
+		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+	apic->lapic_timer.target_expiration =
+		ktime_add_ns(apic->lapic_timer.target_expiration,
+				apic->lapic_timer.period);
+}
+
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
 {
 	if (!lapic_in_kernel(vcpu))
@@ -1356,52 +1432,59 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
-static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+static void cancel_hv_timer(struct kvm_lapic *apic)
 {
 	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
 	apic->lapic_timer.hv_timer_in_use = false;
 }
 
-void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
-	WARN_ON(swait_active(&vcpu->wq));
-	cancel_hv_tscdeadline(apic);
-	apic_timer_expired(apic);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
-
-static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+static bool start_hv_timer(struct kvm_lapic *apic)
 {
 	u64 tscdeadline = apic->lapic_timer.tscdeadline;
 
-	if (atomic_read(&apic->lapic_timer.pending) ||
+	if ((atomic_read(&apic->lapic_timer.pending) &&
+		!apic_lvtt_period(apic)) ||
 		kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
 		if (apic->lapic_timer.hv_timer_in_use)
-			cancel_hv_tscdeadline(apic);
+			cancel_hv_timer(apic);
 	} else {
 		apic->lapic_timer.hv_timer_in_use = true;
 		hrtimer_cancel(&apic->lapic_timer.timer);
 
 		/* In case the sw timer triggered in the window */
-		if (atomic_read(&apic->lapic_timer.pending))
-			cancel_hv_tscdeadline(apic);
+		if (atomic_read(&apic->lapic_timer.pending) &&
+			!apic_lvtt_period(apic))
+			cancel_hv_timer(apic);
 	}
 	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
 			apic->lapic_timer.hv_timer_in_use);
 	return apic->lapic_timer.hv_timer_in_use;
 }
 
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	WARN_ON(swait_active(&vcpu->wq));
+	cancel_hv_timer(apic);
+	apic_timer_expired(apic);
+
+	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+		advance_periodic_target_expiration(apic);
+		if (!start_hv_timer(apic))
+			start_sw_period(apic);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	WARN_ON(apic->lapic_timer.hv_timer_in_use);
 
-	if (apic_lvtt_tscdeadline(apic))
-		start_hv_tscdeadline(apic);
+	start_hv_timer(apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
@@ -1413,62 +1496,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 	if (!apic->lapic_timer.hv_timer_in_use)
 		return;
 
-	cancel_hv_tscdeadline(apic);
+	cancel_hv_timer(apic);
 
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;
 
-	start_sw_tscdeadline(apic);
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+		start_sw_period(apic);
+	else if (apic_lvtt_tscdeadline(apic))
+		start_sw_tscdeadline(apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
-	ktime_t now;
-
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
-		/* lapic timer in oneshot or periodic mode */
-		now = apic->lapic_timer.timer.base->get_time();
-		apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
-			    * APIC_BUS_CYCLE_NS * apic->divide_count;
-
-		if (!apic->lapic_timer.period)
-			return;
-		/*
-		 * Do not allow the guest to program periodic timers with small
-		 * interval, since the hrtimers are not throttled by the host
-		 * scheduler.
-		 */
-		if (apic_lvtt_period(apic)) {
-			s64 min_period = min_timer_period_us * 1000LL;
-
-			if (apic->lapic_timer.period < min_period) {
-				pr_info_ratelimited(
-				    "kvm: vcpu %i: requested %lld ns "
-				    "lapic timer period limited to %lld ns\n",
-				    apic->vcpu->vcpu_id,
-				    apic->lapic_timer.period, min_period);
-				apic->lapic_timer.period = min_period;
-			}
-		}
-
-		hrtimer_start(&apic->lapic_timer.timer,
-			      ktime_add_ns(now, apic->lapic_timer.period),
-			      HRTIMER_MODE_ABS_PINNED);
-
-		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
-			   PRIx64 ", "
-			   "timer initial count 0x%x, period %lldns, "
-			   "expire @ 0x%016" PRIx64 ".\n", __func__,
-			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
-			   kvm_lapic_get_reg(apic, APIC_TMICT),
-			   apic->lapic_timer.period,
-			   ktime_to_ns(ktime_add_ns(now,
-					apic->lapic_timer.period)));
+		if (set_target_expiration(apic) &&
+			!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
+			start_sw_period(apic);
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+		if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
 			start_sw_tscdeadline(apic);
 	}
 }
@@ -1701,13 +1750,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
  * LAPIC interface
  *----------------------------------------------------------------------
  */
+u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!lapic_in_kernel(vcpu))
+		return 0;
+
+	return apic->lapic_timer.tscdeadline;
+}
 
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
-			apic_lvtt_period(apic))
+	if (!lapic_in_kernel(vcpu) ||
+		!apic_lvtt_tscdeadline(apic))
 		return 0;
 
 	return apic->lapic_timer.tscdeadline;
@@ -1748,14 +1806,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	u64 old_value = vcpu->arch.apic_base;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!apic) {
+	if (!apic)
 		value |= MSR_IA32_APICBASE_BSP;
-		vcpu->arch.apic_base = value;
-		return;
-	}
 
 	vcpu->arch.apic_base = value;
 
+	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+		kvm_update_cpuid(vcpu);
+
+	if (!apic)
+		return;
+
 	/* update jump label if enable bit changes */
 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
 		if (value & MSR_IA32_APICBASE_ENABLE) {
@@ -1909,6 +1970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
 	apic_timer_expired(apic);
 
 	if (lapic_is_periodic(apic)) {
+		advance_periodic_target_expiration(apic);
 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
 		return HRTIMER_RESTART;
 	} else
@@ -1993,6 +2055,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVTT);
 		if (apic_lvtt_tscdeadline(apic))
 			apic->lapic_timer.tscdeadline = 0;
+		if (apic_lvtt_oneshot(apic)) {
+			apic->lapic_timer.tscdeadline = 0;
+			apic->lapic_timer.target_expiration = ktime_set(0, 0);
+		}
 		atomic_set(&apic->lapic_timer.pending, 0);
 	}
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f60d01c29d51..e0c80233b3e1 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -15,6 +15,7 @@
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
+	ktime_t target_expiration;
 	u32 timer_mode;
 	u32 timer_mode_mask;
 	u64 tscdeadline;
@@ -85,6 +86,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
+u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 87c5880ba3b7..7012de4a1fed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1660,17 +1660,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 	 * This has some overhead, but not as much as the cost of swapping
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
-	if (!shadow_accessed_mask) {
-		/*
-		 * We are holding the kvm->mmu_lock, and we are blowing up
-		 * shadow PTEs. MMU notifier consumers need to be kept at bay.
-		 * This is correct as long as we don't decouple the mmu_lock
-		 * protected regions (like invalidate_range_start|end does).
-		 */
-		kvm->mmu_notifier_seq++;
+	if (!shadow_accessed_mask)
 		return kvm_handle_hva_range(kvm, start, end, 0,
 					    kvm_unmap_rmapp);
-	}
 
 	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
 }
@@ -4509,7 +4501,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_RETRY;
@@ -4528,12 +4520,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 			return r;
 	}
 
-	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+				      false);
 	if (r < 0)
 		return r;
 	if (!r)
 		return 1;
 
+	/*
+	 * Before emulating the instruction, check if the error code
+	 * was due to a RO violation while translating the guest page.
+	 * This can occur when using nested virtualization with nested
+	 * paging in both guests. If true, we simply unprotect the page
+	 * and resume the guest.
+	 *
+	 * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
+	 *       in PFERR_NEXT_GUEST_PAGE)
+	 */
+	if (error_code == PFERR_NESTED_GUEST_PAGE) {
+		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+		return 1;
+	}
+
 	if (mmio_info_in_cache(vcpu, cr2, direct))
 		emulation_type = 0;
 emulate:
@@ -4967,7 +4975,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 	 * zap all shadow pages.
 	 */
 	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
-		printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
+		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ca1eca5038d..08a4d3ab3455 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 static int pf_interception(struct vcpu_svm *svm)
 {
 	u64 fault_address = svm->vmcb->control.exit_info_2;
-	u32 error_code;
+	u64 error_code;
 	int r = 1;
 
 	switch (svm->apf_reason) {
@@ -2270,7 +2270,7 @@ static int io_interception(struct vcpu_svm *svm)
 	++svm->vcpu.stat.io_exits;
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
-	if (string || in)
+	if (string)
 		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
 	port = io_info >> 16;
@@ -2278,7 +2278,8 @@ static int io_interception(struct vcpu_svm *svm)
 	svm->next_rip = svm->vmcb->control.exit_info_2;
 	skip_emulated_instruction(&svm->vcpu);
 
-	return kvm_fast_pio_out(vcpu, size, port);
+	return in ? kvm_fast_pio_in(vcpu, size, port)
+		  : kvm_fast_pio_out(vcpu, size, port);
 }
 
 static int nmi_interception(struct vcpu_svm *svm)
@@ -3150,8 +3151,7 @@ static int skinit_interception(struct vcpu_svm *svm)
 
 static int wbinvd_interception(struct vcpu_svm *svm)
 {
-	kvm_emulate_wbinvd(&svm->vcpu);
-	return 1;
+	return kvm_emulate_wbinvd(&svm->vcpu);
 }
 
 static int xsetbv_interception(struct vcpu_svm *svm)
@@ -3238,8 +3238,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
 static int cpuid_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
-	kvm_emulate_cpuid(&svm->vcpu);
-	return 1;
+	return kvm_emulate_cpuid(&svm->vcpu);
 }
 
 static int iret_interception(struct vcpu_svm *svm)
@@ -3275,9 +3274,7 @@ static int rdpmc_interception(struct vcpu_svm *svm)
 		return emulate_on_interception(svm);
 
 	err = kvm_rdpmc(&svm->vcpu);
-	kvm_complete_insn_gp(&svm->vcpu, err);
-
-	return 1;
+	return kvm_complete_insn_gp(&svm->vcpu, err);
 }
 
 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
@@ -3374,9 +3371,7 @@ static int cr_interception(struct vcpu_svm *svm)
 		}
 		kvm_register_write(&svm->vcpu, reg, val);
 	}
-	kvm_complete_insn_gp(&svm->vcpu, err);
-
-	return 1;
+	return kvm_complete_insn_gp(&svm->vcpu, err);
 }
 
 static int dr_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3980da515fd0..aae43c6f2472 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -132,6 +132,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 
 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
 
+#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
+	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
+	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
+	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
+	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
+	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
+	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
+	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
+	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
 /*
  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  * ple_gap:    upper bound on the amount of time between two successive
@@ -446,23 +462,31 @@ struct nested_vmx {
 	u16 vpid02;
 	u16 last_vpid;
 
+	/*
+	 * We only store the "true" versions of the VMX capability MSRs. We
+	 * generate the "non-true" versions by setting the must-be-1 bits
+	 * according to the SDM.
+	 */
 	u32 nested_vmx_procbased_ctls_low;
 	u32 nested_vmx_procbased_ctls_high;
-	u32 nested_vmx_true_procbased_ctls_low;
 	u32 nested_vmx_secondary_ctls_low;
 	u32 nested_vmx_secondary_ctls_high;
 	u32 nested_vmx_pinbased_ctls_low;
 	u32 nested_vmx_pinbased_ctls_high;
 	u32 nested_vmx_exit_ctls_low;
 	u32 nested_vmx_exit_ctls_high;
-	u32 nested_vmx_true_exit_ctls_low;
 	u32 nested_vmx_entry_ctls_low;
 	u32 nested_vmx_entry_ctls_high;
-	u32 nested_vmx_true_entry_ctls_low;
 	u32 nested_vmx_misc_low;
 	u32 nested_vmx_misc_high;
 	u32 nested_vmx_ept_caps;
 	u32 nested_vmx_vpid_caps;
+	u64 nested_vmx_basic;
+	u64 nested_vmx_cr0_fixed0;
+	u64 nested_vmx_cr0_fixed1;
+	u64 nested_vmx_cr4_fixed0;
+	u64 nested_vmx_cr4_fixed1;
+	u64 nested_vmx_vmcs_enum;
 };
 
 #define POSTED_INTR_ON  0
@@ -520,6 +544,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc)
 			(unsigned long *)&pi_desc->control);
 }
 
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+	clear_bit(POSTED_INTR_ON,
+  		  (unsigned long *)&pi_desc->control);
+}
+
 static inline int pi_test_on(struct pi_desc *pi_desc)
 {
 	return test_bit(POSTED_INTR_ON,
@@ -920,16 +950,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 
-static unsigned long *vmx_io_bitmap_a;
-static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap_legacy;
-static unsigned long *vmx_msr_bitmap_longmode;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
-static unsigned long *vmx_vmread_bitmap;
-static unsigned long *vmx_vmwrite_bitmap;
+enum {
+	VMX_IO_BITMAP_A,
+	VMX_IO_BITMAP_B,
+	VMX_MSR_BITMAP_LEGACY,
+	VMX_MSR_BITMAP_LONGMODE,
+	VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
+	VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
+	VMX_MSR_BITMAP_LEGACY_X2APIC,
+	VMX_MSR_BITMAP_LONGMODE_X2APIC,
+	VMX_VMREAD_BITMAP,
+	VMX_VMWRITE_BITMAP,
+	VMX_BITMAP_NR
+};
+
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
+#define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
+#define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
+#define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
+#define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
+#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
+#define vmx_msr_bitmap_legacy_x2apic         (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
+#define vmx_msr_bitmap_longmode_x2apic       (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
+#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
 
 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -2523,14 +2569,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 		if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
 			if (is_long_mode(vcpu))
-				msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+				msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
 			else
-				msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+				msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
 		} else {
 			if (is_long_mode(vcpu))
-				msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+				msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
 			else
-				msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+				msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
 		}
 	} else {
 		if (is_long_mode(vcpu))
@@ -2706,9 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
 	/* We support free control of debug control saving. */
-	vmx->nested.nested_vmx_true_exit_ctls_low =
-		vmx->nested.nested_vmx_exit_ctls_low &
-		~VM_EXIT_SAVE_DEBUG_CONTROLS;
+	vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
 
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2727,9 +2771,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
 	/* We support free control of debug control loading. */
-	vmx->nested.nested_vmx_true_entry_ctls_low =
-		vmx->nested.nested_vmx_entry_ctls_low &
-		~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+	vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
 
 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2762,8 +2804,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		CPU_BASED_USE_MSR_BITMAPS;
 
 	/* We support free control of CR3 access interception. */
-	vmx->nested.nested_vmx_true_procbased_ctls_low =
-		vmx->nested.nested_vmx_procbased_ctls_low &
+	vmx->nested.nested_vmx_procbased_ctls_low &=
 		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
 
 	/* secondary cpu-based controls */
@@ -2774,6 +2815,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	vmx->nested.nested_vmx_secondary_ctls_high &=
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_RDTSCP |
+		SECONDARY_EXEC_DESC |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 		SECONDARY_EXEC_ENABLE_VPID |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -2805,8 +2847,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	 */
 	if (enable_vpid)
 		vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
-				VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
-				VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+			VMX_VPID_EXTENT_SUPPORTED_MASK;
 	else
 		vmx->nested.nested_vmx_vpid_caps = 0;
 
@@ -2823,14 +2864,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 		VMX_MISC_ACTIVITY_HLT;
 	vmx->nested.nested_vmx_misc_high = 0;
+
+	/*
+	 * This MSR reports some information about VMX support. We
+	 * should return information about the VMX we emulate for the
+	 * guest, and the VMCS structure we give it - not about the
+	 * VMX support of the underlying hardware.
+	 */
+	vmx->nested.nested_vmx_basic =
+		VMCS12_REVISION |
+		VMX_BASIC_TRUE_CTLS |
+		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+	if (cpu_has_vmx_basic_inout())
+		vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
+
+	/*
+	 * These MSRs specify bits which the guest must keep fixed on
+	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+	 * We picked the standard core2 setting.
+	 */
+#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
+	vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
+	vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+	/* These MSRs specify bits which the guest must keep fixed off. */
+	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
+	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
+
+	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
+	vmx->nested.nested_vmx_vmcs_enum = 0x2e;
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+	return ((val & fixed1) | fixed0) == val;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
 {
-	/*
-	 * Bits 0 in high must be 0, and bits 1 in low must be 1.
-	 */
-	return ((control & high) | low) == control;
+	return fixed_bits_valid(control, low, high);
 }
 
 static inline u64 vmx_control_msr(u32 low, u32 high)
@@ -2838,87 +2917,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
 	return low | ((u64)high << 32);
 }
 
-/* Returns 0 on success, non-0 otherwise. */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+	superset &= mask;
+	subset &= mask;
+
+	return (superset | subset) == superset;
+}
+
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+	const u64 feature_and_reserved =
+		/* feature (except bit 48; see below) */
+		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+		/* reserved */
+		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+	u64 vmx_basic = vmx->nested.nested_vmx_basic;
+
+	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+		return -EINVAL;
+
+	/*
+	 * KVM does not emulate a version of VMX that constrains physical
+	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
+	 */
+	if (data & BIT_ULL(48))
+		return -EINVAL;
+
+	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+	    vmx_basic_vmcs_revision_id(data))
+		return -EINVAL;
+
+	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_basic = data;
+	return 0;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+	u64 supported;
+	u32 *lowp, *highp;
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+		lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
+		highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+		lowp = &vmx->nested.nested_vmx_exit_ctls_low;
+		highp = &vmx->nested.nested_vmx_exit_ctls_high;
+		break;
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+		lowp = &vmx->nested.nested_vmx_entry_ctls_low;
+		highp = &vmx->nested.nested_vmx_entry_ctls_high;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
+		highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+		break;
+	default:
+		BUG();
+	}
+
+	supported = vmx_control_msr(*lowp, *highp);
+
+	/* Check must-be-1 bits are still 1. */
+	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+		return -EINVAL;
+
+	/* Check must-be-0 bits are still 0. */
+	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+		return -EINVAL;
+
+	*lowp = data;
+	*highp = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+	const u64 feature_and_reserved_bits =
+		/* feature */
+		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+		/* reserved */
+		GENMASK_ULL(13, 9) | BIT_ULL(31);
+	u64 vmx_misc;
+
+	vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
+				   vmx->nested.nested_vmx_misc_high);
+
+	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+		return -EINVAL;
+
+	if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
+	    vmx_misc_preemption_timer_rate(data) !=
+	    vmx_misc_preemption_timer_rate(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+		return -EINVAL;
+
+	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_misc_low = data;
+	vmx->nested.nested_vmx_misc_high = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+	u64 vmx_ept_vpid_cap;
+
+	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
+					   vmx->nested.nested_vmx_vpid_caps);
+
+	/* Every bit is either reserved or a feature bit. */
+	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+		return -EINVAL;
+
+	vmx->nested.nested_vmx_ept_caps = data;
+	vmx->nested.nested_vmx_vpid_caps = data >> 32;
+	return 0;
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+	u64 *msr;
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_CR0_FIXED0:
+		msr = &vmx->nested.nested_vmx_cr0_fixed0;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED0:
+		msr = &vmx->nested.nested_vmx_cr4_fixed0;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
+	 * must be 1 in the restored value.
+	 */
+	if (!is_bitwise_subset(data, *msr, -1ULL))
+		return -EINVAL;
+
+	*msr = data;
+	return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	switch (msr_index) {
 	case MSR_IA32_VMX_BASIC:
+		return vmx_restore_vmx_basic(vmx, data);
+	case MSR_IA32_VMX_PINBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		/*
+		 * The "non-true" VMX capability MSRs are generated from the
+		 * "true" MSRs, so we do not support restoring them directly.
+		 *
+		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+		 * should restore the "true" MSRs with the must-be-1 bits
+		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+		 * DEFAULT SETTINGS".
+		 */
+		return -EINVAL;
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		return vmx_restore_control_msr(vmx, msr_index, data);
+	case MSR_IA32_VMX_MISC:
+		return vmx_restore_vmx_misc(vmx, data);
+	case MSR_IA32_VMX_CR0_FIXED0:
+	case MSR_IA32_VMX_CR4_FIXED0:
+		return vmx_restore_fixed0_msr(vmx, msr_index, data);
+	case MSR_IA32_VMX_CR0_FIXED1:
+	case MSR_IA32_VMX_CR4_FIXED1:
+		/*
+		 * These MSRs are generated based on the vCPU's CPUID, so we
+		 * do not support restoring them directly.
+		 */
+		return -EINVAL;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+	case MSR_IA32_VMX_VMCS_ENUM:
+		vmx->nested.nested_vmx_vmcs_enum = data;
+		return 0;
+	default:
 		/*
-		 * This MSR reports some information about VMX support. We
-		 * should return information about the VMX we emulate for the
-		 * guest, and the VMCS structure we give it - not about the
-		 * VMX support of the underlying hardware.
+		 * The rest of the VMX capability MSRs do not support restore.
 		 */
-		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
-			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
-			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
-		if (cpu_has_vmx_basic_inout())
-			*pdata |= VMX_BASIC_INOUT;
+		return -EINVAL;
+	}
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	switch (msr_index) {
+	case MSR_IA32_VMX_BASIC:
+		*pdata = vmx->nested.nested_vmx_basic;
 		break;
 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 	case MSR_IA32_VMX_PINBASED_CTLS:
 		*pdata = vmx_control_msr(
 			vmx->nested.nested_vmx_pinbased_ctls_low,
 			vmx->nested.nested_vmx_pinbased_ctls_high);
+		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 		break;
 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
-		*pdata = vmx_control_msr(
-			vmx->nested.nested_vmx_true_procbased_ctls_low,
-			vmx->nested.nested_vmx_procbased_ctls_high);
-		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS:
 		*pdata = vmx_control_msr(
 			vmx->nested.nested_vmx_procbased_ctls_low,
 			vmx->nested.nested_vmx_procbased_ctls_high);
+		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 		break;
 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
-		*pdata = vmx_control_msr(
-			vmx->nested.nested_vmx_true_exit_ctls_low,
-			vmx->nested.nested_vmx_exit_ctls_high);
-		break;
 	case MSR_IA32_VMX_EXIT_CTLS:
 		*pdata = vmx_control_msr(
 			vmx->nested.nested_vmx_exit_ctls_low,
 			vmx->nested.nested_vmx_exit_ctls_high);
+		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 		break;
 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
-		*pdata = vmx_control_msr(
-			vmx->nested.nested_vmx_true_entry_ctls_low,
-			vmx->nested.nested_vmx_entry_ctls_high);
-		break;
 	case MSR_IA32_VMX_ENTRY_CTLS:
 		*pdata = vmx_control_msr(
 			vmx->nested.nested_vmx_entry_ctls_low,
 			vmx->nested.nested_vmx_entry_ctls_high);
+		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 		break;
 	case MSR_IA32_VMX_MISC:
 		*pdata = vmx_control_msr(
 			vmx->nested.nested_vmx_misc_low,
 			vmx->nested.nested_vmx_misc_high);
 		break;
-	/*
-	 * These MSRs specify bits which the guest must keep fixed (on or off)
-	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
-	 * We picked the standard core2 setting.
-	 */
-#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
-#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
 	case MSR_IA32_VMX_CR0_FIXED0:
-		*pdata = VMXON_CR0_ALWAYSON;
+		*pdata = vmx->nested.nested_vmx_cr0_fixed0;
 		break;
 	case MSR_IA32_VMX_CR0_FIXED1:
-		*pdata = -1ULL;
+		*pdata = vmx->nested.nested_vmx_cr0_fixed1;
 		break;
 	case MSR_IA32_VMX_CR4_FIXED0:
-		*pdata = VMXON_CR4_ALWAYSON;
+		*pdata = vmx->nested.nested_vmx_cr4_fixed0;
 		break;
 	case MSR_IA32_VMX_CR4_FIXED1:
-		*pdata = -1ULL;
+		*pdata = vmx->nested.nested_vmx_cr4_fixed1;
 		break;
 	case MSR_IA32_VMX_VMCS_ENUM:
-		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+		*pdata = vmx->nested.nested_vmx_vmcs_enum;
 		break;
 	case MSR_IA32_VMX_PROCBASED_CTLS2:
 		*pdata = vmx_control_msr(
@@ -3101,7 +3378,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			vmx_leave_nested(vcpu);
 		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		return 1; /* they are read-only */
+		if (!msr_info->host_initiated)
+			return 1; /* they are read-only */
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		return vmx_set_vmx_msr(vcpu, msr_index, data);
 	case MSR_IA32_XSS:
 		if (!vmx_xsaves_supported())
 			return 1;
@@ -3863,6 +4144,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 		  (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
+static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
+	u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
+		SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+		fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+	return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
+	u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+
+	return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
+	u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
+
+	return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid	nested_cr4_valid
+#define nested_host_cr4_valid	nested_cr4_valid
+
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 
 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -3991,8 +4306,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		if (!nested_vmx_allowed(vcpu))
 			return 1;
 	}
-	if (to_vmx(vcpu)->nested.vmxon &&
-	    ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
+
+	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
 		return 1;
 
 	vcpu->arch.cr4 = cr4;
@@ -4569,41 +4884,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
 	}
 }
 
-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
-						u32 msr, int type)
-{
-	int f = sizeof(unsigned long);
-
-	if (!cpu_has_vmx_msr_bitmap())
-		return;
-
-	/*
-	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-	 * have the write-low and read-high bitmap offsets the wrong way round.
-	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-	 */
-	if (msr <= 0x1fff) {
-		if (type & MSR_TYPE_R)
-			/* read-low */
-			__set_bit(msr, msr_bitmap + 0x000 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-low */
-			__set_bit(msr, msr_bitmap + 0x800 / f);
-
-	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-		msr &= 0x1fff;
-		if (type & MSR_TYPE_R)
-			/* read-high */
-			__set_bit(msr, msr_bitmap + 0x400 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-high */
-			__set_bit(msr, msr_bitmap + 0xc00 / f);
-
-	}
-}
-
 /*
  * If a msr is allowed by L0, we should check whether it is allowed by L1.
  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -4659,48 +4939,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 						msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
-{
-	if (apicv_active) {
-		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-				msr, MSR_TYPE_R);
-		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-				msr, MSR_TYPE_R);
-	} else {
-		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
-				msr, MSR_TYPE_R);
-		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
-				msr, MSR_TYPE_R);
-	}
-}
-
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
+static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
 {
 	if (apicv_active) {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-				msr, MSR_TYPE_R);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-				msr, MSR_TYPE_R);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
+				msr, type);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
+				msr, type);
 	} else {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
-				msr, MSR_TYPE_R);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
-				msr, MSR_TYPE_R);
-	}
-}
-
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
-{
-	if (apicv_active) {
 		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-				msr, MSR_TYPE_W);
+				msr, type);
 		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-				msr, MSR_TYPE_W);
-	} else {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
-				msr, MSR_TYPE_W);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
-				msr, MSR_TYPE_W);
+				msr, type);
 	}
 }
 
@@ -4822,9 +5072,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (!pi_test_and_clear_on(&vmx->pi_desc))
+	if (!pi_test_on(&vmx->pi_desc))
 		return;
 
+	pi_clear_on(&vmx->pi_desc);
+	/*
+	 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+	 * But on x86 this is just a compiler barrier anyway.
+	 */
+	smp_mb__after_atomic();
 	kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
 }
 
@@ -5583,7 +5839,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu)
 static int handle_io(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
-	int size, in, string;
+	int size, in, string, ret;
 	unsigned port;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -5597,9 +5853,14 @@ static int handle_io(struct kvm_vcpu *vcpu)
 
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
-	skip_emulated_instruction(vcpu);
 
-	return kvm_fast_pio_out(vcpu, size, port);
+	ret = kvm_skip_emulated_instruction(vcpu);
+
+	/*
+	 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
+	 * KVM_EXIT_DEBUG here.
+	 */
+	return kvm_fast_pio_out(vcpu, size, port) && ret;
 }
 
 static void
@@ -5613,18 +5874,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
-static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
-	unsigned long always_on = VMXON_CR0_ALWAYSON;
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-	if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
-		SECONDARY_EXEC_UNRESTRICTED_GUEST &&
-	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
-		always_on &= ~(X86_CR0_PE | X86_CR0_PG);
-	return (val & always_on) == always_on;
-}
-
 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
@@ -5643,7 +5892,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 		val = (val & ~vmcs12->cr0_guest_host_mask) |
 			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
 
-		if (!nested_cr0_valid(vcpu, val))
+		if (!nested_guest_cr0_valid(vcpu, val))
 			return 1;
 
 		if (kvm_set_cr0(vcpu, val))
@@ -5652,8 +5901,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 		return 0;
 	} else {
 		if (to_vmx(vcpu)->nested.vmxon &&
-		    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+		    !nested_host_cr0_valid(vcpu, val))
 			return 1;
+
 		return kvm_set_cr0(vcpu, val);
 	}
 }
@@ -5697,6 +5947,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	int cr;
 	int reg;
 	int err;
+	int ret;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	cr = exit_qualification & 15;
@@ -5708,25 +5959,27 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		switch (cr) {
 		case 0:
 			err = handle_set_cr0(vcpu, val);
-			kvm_complete_insn_gp(vcpu, err);
-			return 1;
+			return kvm_complete_insn_gp(vcpu, err);
 		case 3:
 			err = kvm_set_cr3(vcpu, val);
-			kvm_complete_insn_gp(vcpu, err);
-			return 1;
+			return kvm_complete_insn_gp(vcpu, err);
 		case 4:
 			err = handle_set_cr4(vcpu, val);
-			kvm_complete_insn_gp(vcpu, err);
-			return 1;
+			return kvm_complete_insn_gp(vcpu, err);
 		case 8: {
 				u8 cr8_prev = kvm_get_cr8(vcpu);
 				u8 cr8 = (u8)val;
 				err = kvm_set_cr8(vcpu, cr8);
-				kvm_complete_insn_gp(vcpu, err);
+				ret = kvm_complete_insn_gp(vcpu, err);
 				if (lapic_in_kernel(vcpu))
-					return 1;
+					return ret;
 				if (cr8_prev <= cr8)
-					return 1;
+					return ret;
+				/*
+				 * TODO: we might be squashing a
+				 * KVM_GUESTDBG_SINGLESTEP-triggered
+				 * KVM_EXIT_DEBUG here.
+				 */
 				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
 				return 0;
 			}
@@ -5735,23 +5988,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	case 2: /* clts */
 		handle_clts(vcpu);
 		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
-		skip_emulated_instruction(vcpu);
 		vmx_fpu_activate(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	case 1: /*mov from cr*/
 		switch (cr) {
 		case 3:
 			val = kvm_read_cr3(vcpu);
 			kvm_register_write(vcpu, reg, val);
 			trace_kvm_cr_read(cr, val);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		case 8:
 			val = kvm_get_cr8(vcpu);
 			kvm_register_write(vcpu, reg, val);
 			trace_kvm_cr_read(cr, val);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 		break;
 	case 3: /* lmsw */
@@ -5759,8 +6009,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
 		kvm_lmsw(vcpu, val);
 
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	default:
 		break;
 	}
@@ -5831,8 +6080,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
 			return 1;
 
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
@@ -5864,8 +6112,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 
 static int handle_cpuid(struct kvm_vcpu *vcpu)
 {
-	kvm_emulate_cpuid(vcpu);
-	return 1;
+	return kvm_emulate_cpuid(vcpu);
 }
 
 static int handle_rdmsr(struct kvm_vcpu *vcpu)
@@ -5886,8 +6133,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
 	/* FIXME: handling of bits 32:63 of rax, rdx */
 	vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
 	vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
@@ -5907,8 +6153,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
 	}
 
 	trace_kvm_msr_write(ecx, data);
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
@@ -5952,8 +6197,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	kvm_mmu_invlpg(vcpu, exit_qualification);
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_rdpmc(struct kvm_vcpu *vcpu)
@@ -5961,15 +6205,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
 	int err;
 
 	err = kvm_rdpmc(vcpu);
-	kvm_complete_insn_gp(vcpu, err);
-
-	return 1;
+	return kvm_complete_insn_gp(vcpu, err);
 }
 
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
-	kvm_emulate_wbinvd(vcpu);
-	return 1;
+	return kvm_emulate_wbinvd(vcpu);
 }
 
 static int handle_xsetbv(struct kvm_vcpu *vcpu)
@@ -5978,20 +6219,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
 
 	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
-		skip_emulated_instruction(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
 	return 1;
 }
 
 static int handle_xsaves(struct kvm_vcpu *vcpu)
 {
-	skip_emulated_instruction(vcpu);
+	kvm_skip_emulated_instruction(vcpu);
 	WARN(1, "this should never happen\n");
 	return 1;
 }
 
 static int handle_xrstors(struct kvm_vcpu *vcpu)
 {
-	skip_emulated_instruction(vcpu);
+	kvm_skip_emulated_instruction(vcpu);
 	WARN(1, "this should never happen\n");
 	return 1;
 }
@@ -6012,8 +6253,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
 		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
 		    (offset == APIC_EOI)) {
 			kvm_lapic_set_eoi(vcpu);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 	}
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -6161,9 +6401,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
-		skip_emulated_instruction(vcpu);
 		trace_kvm_fast_mmio(gpa);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	ret = handle_mmio_page_fault(vcpu, gpa, true);
@@ -6348,50 +6587,13 @@ static __init int hardware_setup(void)
 	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
 		kvm_define_shared_msr(i, vmx_msr_index[i]);
 
-	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_io_bitmap_a)
-		return r;
+	for (i = 0; i < VMX_BITMAP_NR; i++) {
+		vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
+		if (!vmx_bitmap[i])
+			goto out;
+	}
 
 	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_io_bitmap_b)
-		goto out;
-
-	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_legacy)
-		goto out1;
-
-	vmx_msr_bitmap_legacy_x2apic =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_legacy_x2apic)
-		goto out2;
-
-	vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
-		goto out3;
-
-	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_longmode)
-		goto out4;
-
-	vmx_msr_bitmap_longmode_x2apic =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_longmode_x2apic)
-		goto out5;
-
-	vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
-				(unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
-		goto out6;
-
-	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_vmread_bitmap)
-		goto out7;
-
-	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_vmwrite_bitmap)
-		goto out8;
-
 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
@@ -6409,7 +6611,7 @@ static __init int hardware_setup(void)
 
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
-		goto out9;
+		goto out;
 	}
 
 	if (boot_cpu_has(X86_FEATURE_NX))
@@ -6472,39 +6674,34 @@ static __init int hardware_setup(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
 
-	memcpy(vmx_msr_bitmap_legacy_x2apic,
+	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_longmode_x2apic,
+	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+	memcpy(vmx_msr_bitmap_legacy_x2apic,
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
-	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
+	for (msr = 0x800; msr <= 0x8ff; msr++) {
+		if (msr == 0x839 /* TMCCT */)
+			continue;
+		vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
+	}
+
 	/*
-	 * enable_apicv && kvm_vcpu_apicv_active()
+	 * TPR reads and writes can be virtualized even if virtual interrupt
+	 * delivery is not in use.
 	 */
-	for (msr = 0x800; msr <= 0x8ff; msr++)
-		vmx_disable_intercept_msr_read_x2apic(msr, true);
+	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
+	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
 
-	/* TMCCT */
-	vmx_enable_intercept_msr_read_x2apic(0x839, true);
-	/* TPR */
-	vmx_disable_intercept_msr_write_x2apic(0x808, true);
 	/* EOI */
-	vmx_disable_intercept_msr_write_x2apic(0x80b, true);
+	vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
 	/* SELF-IPI */
-	vmx_disable_intercept_msr_write_x2apic(0x83f, true);
-
-	/*
-	 * (enable_apicv && !kvm_vcpu_apicv_active()) ||
-	 * 	!enable_apicv
-	 */
-	/* TPR */
-	vmx_disable_intercept_msr_read_x2apic(0x808, false);
-	vmx_disable_intercept_msr_write_x2apic(0x808, false);
+	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
 
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6551,42 +6748,19 @@ static __init int hardware_setup(void)
 
 	return alloc_kvm_area();
 
-out9:
-	free_page((unsigned long)vmx_vmwrite_bitmap);
-out8:
-	free_page((unsigned long)vmx_vmread_bitmap);
-out7:
-	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
-out6:
-	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out5:
-	free_page((unsigned long)vmx_msr_bitmap_longmode);
-out4:
-	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
-out3:
-	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
-	free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
-	free_page((unsigned long)vmx_io_bitmap_b);
 out:
-	free_page((unsigned long)vmx_io_bitmap_a);
+	for (i = 0; i < VMX_BITMAP_NR; i++)
+		free_page((unsigned long)vmx_bitmap[i]);
 
     return r;
 }
 
 static __exit void hardware_unsetup(void)
 {
-	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
-	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
-	free_page((unsigned long)vmx_msr_bitmap_legacy);
-	free_page((unsigned long)vmx_msr_bitmap_longmode);
-	free_page((unsigned long)vmx_io_bitmap_b);
-	free_page((unsigned long)vmx_io_bitmap_a);
-	free_page((unsigned long)vmx_vmwrite_bitmap);
-	free_page((unsigned long)vmx_vmread_bitmap);
+	int i;
+
+	for (i = 0; i < VMX_BITMAP_NR; i++)
+		free_page((unsigned long)vmx_bitmap[i]);
 
 	free_kvm_area();
 }
@@ -6600,16 +6774,13 @@ static int handle_pause(struct kvm_vcpu *vcpu)
 	if (ple_gap)
 		grow_ple_window(vcpu);
 
-	skip_emulated_instruction(vcpu);
 	kvm_vcpu_on_spin(vcpu);
-
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_nop(struct kvm_vcpu *vcpu)
 {
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_mwait(struct kvm_vcpu *vcpu)
@@ -6916,8 +7087,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 		 */
 		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
 			nested_vmx_failInvalid(vcpu);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 
 		page = nested_get_page(vcpu, vmptr);
@@ -6925,8 +7095,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 		    *(u32 *)kmap(page) != VMCS12_REVISION) {
 			nested_vmx_failInvalid(vcpu);
 			kunmap(page);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 		kunmap(page);
 		vmx->nested.vmxon_ptr = vmptr;
@@ -6935,30 +7104,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
 		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
 			nested_vmx_failValid(vcpu,
 					     VMXERR_VMCLEAR_INVALID_ADDRESS);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 
 		if (vmptr == vmx->nested.vmxon_ptr) {
 			nested_vmx_failValid(vcpu,
 					     VMXERR_VMCLEAR_VMXON_POINTER);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 		break;
 	case EXIT_REASON_VMPTRLD:
 		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
 			nested_vmx_failValid(vcpu,
 					     VMXERR_VMPTRLD_INVALID_ADDRESS);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 
 		if (vmptr == vmx->nested.vmxon_ptr) {
 			nested_vmx_failValid(vcpu,
 					     VMXERR_VMCLEAR_VMXON_POINTER);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 		break;
 	default:
@@ -7014,8 +7179,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 
 	if (vmx->nested.vmxon) {
 		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
@@ -7055,9 +7219,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 
 	vmx->nested.vmxon = true;
 
-	skip_emulated_instruction(vcpu);
 	nested_vmx_succeed(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 
 out_shadow_vmcs:
 	kfree(vmx->nested.cached_vmcs12);
@@ -7176,9 +7339,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 	free_nested(to_vmx(vcpu));
-	skip_emulated_instruction(vcpu);
 	nested_vmx_succeed(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 /* Emulate the VMCLEAR instruction */
@@ -7217,9 +7379,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 
 	nested_free_vmcs02(vmx, vmptr);
 
-	skip_emulated_instruction(vcpu);
 	nested_vmx_succeed(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -7417,7 +7578,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	if (vmx->nested.current_vmptr == -1ull) {
 		nested_vmx_failInvalid(vcpu);
-		skip_emulated_instruction(vcpu);
 		return 0;
 	}
 	return 1;
@@ -7431,17 +7591,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	gva_t gva = 0;
 
-	if (!nested_vmx_check_permission(vcpu) ||
-	    !nested_vmx_check_vmcs12(vcpu))
+	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
+	if (!nested_vmx_check_vmcs12(vcpu))
+		return kvm_skip_emulated_instruction(vcpu);
+
 	/* Decode instruction info and find the field to read */
 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	/* Read the field, zero-extended to a u64 field_value */
 	if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 	/*
 	 * Now copy part of this value to register or memory, as requested.
@@ -7461,8 +7622,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	}
 
 	nested_vmx_succeed(vcpu);
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 
@@ -7481,10 +7641,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	u64 field_value = 0;
 	struct x86_exception e;
 
-	if (!nested_vmx_check_permission(vcpu) ||
-	    !nested_vmx_check_vmcs12(vcpu))
+	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
+	if (!nested_vmx_check_vmcs12(vcpu))
+		return kvm_skip_emulated_instruction(vcpu);
+
 	if (vmx_instruction_info & (1u << 10))
 		field_value = kvm_register_readl(vcpu,
 			(((vmx_instruction_info) >> 3) & 0xf));
@@ -7504,19 +7666,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	if (vmcs_field_readonly(field)) {
 		nested_vmx_failValid(vcpu,
 			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	if (vmcs12_write_any(vcpu, field, field_value) < 0) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	nested_vmx_succeed(vcpu);
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 /* Emulate the VMPTRLD instruction */
@@ -7537,8 +7696,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		page = nested_get_page(vcpu, vmptr);
 		if (page == NULL) {
 			nested_vmx_failInvalid(vcpu);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 		new_vmcs12 = kmap(page);
 		if (new_vmcs12->revision_id != VMCS12_REVISION) {
@@ -7546,8 +7704,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 			nested_release_page_clean(page);
 			nested_vmx_failValid(vcpu,
 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-			skip_emulated_instruction(vcpu);
-			return 1;
+			return kvm_skip_emulated_instruction(vcpu);
 		}
 
 		nested_release_vmcs12(vmx);
@@ -7571,8 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	}
 
 	nested_vmx_succeed(vcpu);
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 /* Emulate the VMPTRST instruction */
@@ -7597,8 +7753,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 	nested_vmx_succeed(vcpu);
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 /* Emulate the INVEPT instruction */
@@ -7636,8 +7791,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	if (type >= 32 || !(types & (1 << type))) {
 		nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	/* According to the Intel VMX instruction reference, the memory
@@ -7668,8 +7822,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 		break;
 	}
 
-	skip_emulated_instruction(vcpu);
-	return 1;
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_invvpid(struct kvm_vcpu *vcpu)
@@ -7694,13 +7847,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
-	types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+	types = (vmx->nested.nested_vmx_vpid_caps &
+			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
 
 	if (type >= 32 || !(types & (1 << type))) {
 		nested_vmx_failValid(vcpu,
 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		skip_emulated_instruction(vcpu);
-		return 1;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	/* according to the intel vmx instruction reference, the memory
@@ -7716,23 +7869,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	}
 
 	switch (type) {
+	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
-		/*
-		 * Old versions of KVM use the single-context version so we
-		 * have to support it; just treat it the same as all-context.
-		 */
+	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+		if (!vpid) {
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+			return kvm_skip_emulated_instruction(vcpu);
+		}
+		break;
 	case VMX_VPID_EXTENT_ALL_CONTEXT:
-		__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
-		nested_vmx_succeed(vcpu);
 		break;
 	default:
-		/* Trap individual address invalidation invvpid calls */
-		BUG_ON(1);
-		break;
+		WARN_ON_ONCE(1);
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
-	skip_emulated_instruction(vcpu);
-	return 1;
+	__vmx_flush_tlb(vcpu, vmx->nested.vpid02);
+	nested_vmx_succeed(vcpu);
+
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -8071,6 +8227,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
 	case EXIT_REASON_IO_INSTRUCTION:
 		return nested_vmx_exit_handled_io(vcpu, vmcs12);
+	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
 	case EXIT_REASON_MSR_READ:
 	case EXIT_REASON_MSR_WRITE:
 		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -8620,11 +8778,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	register void *__sp asm(_ASM_SP);
 
-	/*
-	 * If external interrupt exists, IF bit is set in rflags/eflags on the
-	 * interrupt stack frame, and interrupt will be enabled on a return
-	 * from interrupt handler.
-	 */
 	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
 			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
 		unsigned int vector;
@@ -8809,7 +8962,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 					msrs[i].host);
 }
 
-void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 tscl;
@@ -9279,6 +9432,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
 		     (new_ctl & ~mask) | (cur_ctl & mask));
 }
 
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_cpuid_entry2 *entry;
+
+	vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
+	vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
+	if (entry && (entry->_reg & (_cpuid_mask)))			\
+		vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask);	\
+} while (0)
+
+	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+	cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
+	cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
+	cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
+	cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
+	cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
+	cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
+	cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
+	cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
+	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
+	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
+	cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
+	cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
+	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
+	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));
+
+	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
+	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
+	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
+	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
+	/* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */
+	cr4_fixed1_update(bit(11),            ecx, bit(2));
+
+#undef cr4_fixed1_update
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -9320,6 +9517,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 	else
 		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
 			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+	if (nested_vmx_allowed(vcpu))
+		nested_vmx_cr_fixed1_bits_update(vcpu);
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9774,6 +9974,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 	return 0;
 }
 
+static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	unsigned long invalid_mask;
+
+	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+	return (val & invalid_mask) == 0;
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
+ * emulating VM entry into a guest with EPT enabled.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+			       unsigned long *entry_failure_code)
+{
+	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
+		if (!nested_cr3_valid(vcpu, cr3)) {
+			*entry_failure_code = ENTRY_FAIL_DEFAULT;
+			return 1;
+		}
+
+		/*
+		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+		 * must not be dereferenced.
+		 */
+		if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
+		    !nested_ept) {
+			if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
+				*entry_failure_code = ENTRY_FAIL_PDPTE;
+				return 1;
+			}
+		}
+
+		vcpu->arch.cr3 = cr3;
+		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+	}
+
+	kvm_mmu_reset_context(vcpu);
+	return 0;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -9782,11 +10025,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
  * needs. In addition to modifying the active vmcs (which is vmcs02), this
  * function also has additional necessary side-effects, like setting various
  * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
  */
-static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			  unsigned long *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
+	bool nested_ept_enabled = false;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -9951,6 +10198,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmcs12->guest_intr_status);
 		}
 
+		nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
 
@@ -9964,6 +10212,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmx_set_constant_host_state(vmx);
 
 	/*
+	 * Set the MSR load/store lists to match L0's settings.
+	 */
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+	/*
 	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
 	 * entry, but only if the current (host) sp changed from the value
 	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
@@ -10069,15 +10326,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		nested_ept_init_mmu_context(vcpu);
 	}
 
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
-		vcpu->arch.efer = vmcs12->guest_ia32_efer;
-	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
-		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
-	else
-		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
-	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
-	vmx_set_efer(vcpu, vcpu->arch.efer);
-
 	/*
 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
 	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
@@ -10092,8 +10340,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-	/* shadow page tables on either EPT or shadow page tables */
-	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->guest_ia32_efer;
+	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	/* Shadow page tables on either EPT or shadow page tables. */
+	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled,
+				entry_failure_code))
+		return 1;
+
 	kvm_mmu_reset_context(vcpu);
 
 	if (!enable_ept)
@@ -10111,6 +10371,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+	return 0;
 }
 
 /*
@@ -10125,12 +10386,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	struct loaded_vmcs *vmcs02;
 	bool ia32e;
 	u32 msr_entry_idx;
+	unsigned long exit_qualification;
 
-	if (!nested_vmx_check_permission(vcpu) ||
-	    !nested_vmx_check_vmcs12(vcpu))
+	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	skip_emulated_instruction(vcpu);
+	if (!nested_vmx_check_vmcs12(vcpu))
+		goto out;
+
 	vmcs12 = get_vmcs12(vcpu);
 
 	if (enable_shadow_vmcs)
@@ -10150,37 +10413,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		nested_vmx_failValid(vcpu,
 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
 			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-		return 1;
+		goto out;
 	}
 
 	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
 	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
+		goto out;
 	}
 
 	if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
+		goto out;
 	}
 
 	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
+		goto out;
 	}
 
 	if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
+		goto out;
 	}
 
 	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
+		goto out;
 	}
 
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
-				vmx->nested.nested_vmx_true_procbased_ctls_low,
+				vmx->nested.nested_vmx_procbased_ctls_low,
 				vmx->nested.nested_vmx_procbased_ctls_high) ||
 	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
 				vmx->nested.nested_vmx_secondary_ctls_low,
@@ -10189,33 +10452,34 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 				vmx->nested.nested_vmx_pinbased_ctls_low,
 				vmx->nested.nested_vmx_pinbased_ctls_high) ||
 	    !vmx_control_verify(vmcs12->vm_exit_controls,
-				vmx->nested.nested_vmx_true_exit_ctls_low,
+				vmx->nested.nested_vmx_exit_ctls_low,
 				vmx->nested.nested_vmx_exit_ctls_high) ||
 	    !vmx_control_verify(vmcs12->vm_entry_controls,
-				vmx->nested.nested_vmx_true_entry_ctls_low,
+				vmx->nested.nested_vmx_entry_ctls_low,
 				vmx->nested.nested_vmx_entry_ctls_high))
 	{
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-		return 1;
+		goto out;
 	}
 
-	if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
-	    ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+	if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
+	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
+	    !nested_cr3_valid(vcpu, vmcs12->host_cr3)) {
 		nested_vmx_failValid(vcpu,
 			VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
-		return 1;
+		goto out;
 	}
 
-	if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
-	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+	if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) {
 		nested_vmx_entry_failure(vcpu, vmcs12,
 			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
-		return 1;
+		goto out;
 	}
 	if (vmcs12->vmcs_link_pointer != -1ull) {
 		nested_vmx_entry_failure(vcpu, vmcs12,
 			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
-		return 1;
+		goto out;
 	}
 
 	/*
@@ -10235,7 +10499,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
 			nested_vmx_entry_failure(vcpu, vmcs12,
 				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
-			return 1;
+			goto out;
 		}
 	}
 
@@ -10253,7 +10517,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
 			nested_vmx_entry_failure(vcpu, vmcs12,
 				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
-			return 1;
+			goto out;
 		}
 	}
 
@@ -10266,6 +10530,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (!vmcs02)
 		return -ENOMEM;
 
+	/*
+	 * After this point, the trap flag no longer triggers a singlestep trap
+	 * on the vm entry instructions. Don't call
+	 * kvm_skip_emulated_instruction.
+	 */
+	skip_emulated_instruction(vcpu);
 	enter_guest_mode(vcpu);
 
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10280,7 +10550,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	vmx_segment_cache_clear(vmx);
 
-	prepare_vmcs02(vcpu, vmcs12);
+	if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) {
+		leave_guest_mode(vcpu);
+		vmx_load_vmcs01(vcpu);
+		nested_vmx_entry_failure(vcpu, vmcs12,
+				EXIT_REASON_INVALID_STATE, exit_qualification);
+		return 1;
+	}
 
 	msr_entry_idx = nested_vmx_load_msr(vcpu,
 					    vmcs12->vm_entry_msr_load_addr,
@@ -10307,6 +10583,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
 	 */
 	return 1;
+
+out:
+	return kvm_skip_emulated_instruction(vcpu);
 }
 
 /*
@@ -10612,6 +10891,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 				   struct vmcs12 *vmcs12)
 {
 	struct kvm_segment seg;
+	unsigned long entry_failure_code;
 
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10649,8 +10929,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
 	nested_ept_uninit_mmu_context(vcpu);
 
-	kvm_set_cr3(vcpu, vmcs12->host_cr3);
-	kvm_mmu_reset_context(vcpu);
+	/*
+	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
+	 * couldn't have changed.
+	 */
+	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 
 	if (!enable_ept)
 		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
@@ -10751,6 +11035,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 vm_inst_error = 0;
 
 	/* trying to cancel vmlaunch/vmresume is a bug */
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
@@ -10763,6 +11048,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 				 vmcs12->vm_exit_msr_store_count))
 		nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
 
+	if (unlikely(vmx->fail))
+		vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+
 	vmx_load_vmcs01(vcpu);
 
 	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -10791,6 +11079,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	load_vmcs12_host_state(vcpu, vmcs12);
 
 	/* Update any VMCS fields that might have changed while L2 ran */
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
 	if (vmx->hv_deadline_tsc == -1)
 		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -10839,7 +11129,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	 */
 	if (unlikely(vmx->fail)) {
 		vmx->fail = 0;
-		nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+		nested_vmx_failValid(vcpu, vm_inst_error);
 	} else
 		nested_vmx_succeed(vcpu);
 	if (enable_shadow_vmcs)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2912684ff902..1f0d2383f5ee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -434,12 +434,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
 	if (err)
 		kvm_inject_gp(vcpu, 0);
 	else
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+
+	return 1;
 }
 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 
@@ -573,7 +575,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(load_pdptrs);
 
-static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 	bool changed = true;
@@ -599,6 +601,7 @@ out:
 
 	return changed;
 }
+EXPORT_SYMBOL_GPL(pdptrs_changed);
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
@@ -2178,7 +2181,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
-		u64 gpa_offset;
 		struct kvm_arch *ka = &vcpu->kvm->arch;
 
 		kvmclock_reset(vcpu);
@@ -2200,8 +2202,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!(data & 1))
 			break;
 
-		gpa_offset = data & ~(PAGE_MASK | 1);
-
 		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
 		     &vcpu->arch.pv_time, data & ~1ULL,
 		     sizeof(struct pvclock_vcpu_time_info)))
@@ -2296,7 +2296,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr_info);
 		if (!ignore_msrs) {
-			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
+			vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
 				    msr, data);
 			return 1;
 		} else {
@@ -2508,7 +2508,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
 			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
 		if (!ignore_msrs) {
-			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
+			vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
+					       msr_info->index);
 			return 1;
 		} else {
 			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
@@ -2812,7 +2813,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		}
 		if (kvm_lapic_hv_timer_in_use(vcpu) &&
 				kvm_x86_ops->set_hv_timer(vcpu,
-					kvm_get_lapic_tscdeadline_msr(vcpu)))
+					kvm_get_lapic_target_expiration_tsc(vcpu)))
 			kvm_lapic_switch_to_sw_timer(vcpu);
 		/*
 		 * On a host with synchronized TSC, there is no need to update
@@ -4832,7 +4833,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
 	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
 }
 
-int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
+static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 {
 	if (!need_emulate_wbinvd(vcpu))
 		return X86EMUL_CONTINUE;
@@ -4852,8 +4853,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	return kvm_emulate_wbinvd_noskip(vcpu);
+	kvm_emulate_wbinvd_noskip(vcpu);
+	return kvm_skip_emulated_instruction(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
 
@@ -5451,7 +5452,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag
 			kvm_run->exit_reason = KVM_EXIT_DEBUG;
 			*r = EMULATE_USER_EXIT;
 		} else {
-			vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
 			/*
 			 * "Certain debug exceptions may clear bit 0-3.  The
 			 * remaining contents of the DR6 register are never
@@ -5464,6 +5464,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag
 	}
 }
 
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+	int r = EMULATE_DONE;
+
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+	return r == EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+
 static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 {
 	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
@@ -5649,6 +5660,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
+static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
+{
+	unsigned long val;
+
+	/* We should only ever be called with arch.pio.count equal to 1 */
+	BUG_ON(vcpu->arch.pio.count != 1);
+
+	/* For size less than 4 we merge, else we zero extend */
+	val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
+					: 0;
+
+	/*
+	 * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+	 * the copy and tracing
+	 */
+	emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
+				 vcpu->arch.pio.port, &val, 1);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+
+	return 1;
+}
+
+int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
+{
+	unsigned long val;
+	int ret;
+
+	/* For size less than 4 we merge, else we zero extend */
+	val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
+
+	ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
+				       &val, 1);
+	if (ret) {
+		kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+		return ret;
+	}
+
+	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio_in);
+
 static int kvmclock_cpu_down_prep(unsigned int cpu)
 {
 	__this_cpu_write(cpu_tsc_khz, 0);
@@ -5998,8 +6052,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	return kvm_vcpu_halt(vcpu);
+	int ret = kvm_skip_emulated_instruction(vcpu);
+	/*
+	 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+	 * KVM_EXIT_DEBUG here.
+	 */
+	return kvm_vcpu_halt(vcpu) && ret;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
@@ -6030,9 +6088,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
-	int op_64_bit, r = 1;
+	int op_64_bit, r;
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	r = kvm_skip_emulated_instruction(vcpu);
 
 	if (kvm_hv_hypercall_enabled(vcpu->kvm))
 		return kvm_hv_hypercall(vcpu);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index bedfab98077a..e1fb269c87af 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	return 0;
 
 error:
-	dev_err(&dev->dev,
-		"Xen PCI frontend has not registered MSI/MSI-X support!\n");
+	dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n",
+		type == PCI_CAP_ID_MSI ? "" : "-X", irq);
 	return irq;
 }
 
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index b27bccd4390f..821cb41f00e6 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -89,7 +89,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
 }
 
 static void ce4100_serial_fixup(int port, struct uart_port *up,
-	unsigned short *capabilites)
+	u32 *capabilites)
 {
 #ifdef CONFIG_EARLY_PRINTK
 	/*
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 0e98e5d241d0..a9fafb5c8738 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -19,7 +19,6 @@
 int xen_swiotlb __read_mostly;
 
 static struct dma_map_ops xen_swiotlb_dma_ops = {
-	.mapping_error = xen_swiotlb_dma_mapping_error,
 	.alloc = xen_swiotlb_alloc_coherent,
 	.free = xen_swiotlb_free_coherent,
 	.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f8960fca0827..8c394e30e5fe 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 unsigned long xen_released_pages;
 
 /* E820 map used during setting up memory. */
-static struct e820entry xen_e820_map[E820MAX] __initdata;
+static struct e820entry xen_e820_map[E820_X_MAX] __initdata;
 static u32 xen_e820_map_entries __initdata;
 
 /*
@@ -750,7 +750,7 @@ char * __init xen_memory_setup(void)
 	max_pfn = min(max_pfn, xen_start_info->nr_pages);
 	mem_end = PFN_PHYS(max_pfn);
 
-	memmap.nr_entries = E820MAX;
+	memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
 	set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
 	op = xen_initial_domain() ?
@@ -923,7 +923,7 @@ char * __init xen_auto_xlated_memory_setup(void)
 	int i;
 	int rc;
 
-	memmap.nr_entries = E820MAX;
+	memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
 	set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
 	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);