diff options
Diffstat (limited to 'arch')
231 files changed, 4974 insertions, 2556 deletions
diff --git a/arch/arm/boot/dts/rk3036.dtsi b/arch/arm/boot/dts/rk3036.dtsi index a935523a1eb8..7c2dc19925a1 100644 --- a/arch/arm/boot/dts/rk3036.dtsi +++ b/arch/arm/boot/dts/rk3036.dtsi @@ -204,7 +204,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; status = "disabled"; }; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 17ec2e2d7a60..74a749c566ee 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -596,7 +596,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; phys = <&usbphy0>; phy-names = "usb2-phy"; status = "disabled"; diff --git a/arch/arm/boot/dts/rk3xxx.dtsi b/arch/arm/boot/dts/rk3xxx.dtsi index e15beb3c671e..8fbd3c806fa0 100644 --- a/arch/arm/boot/dts/rk3xxx.dtsi +++ b/arch/arm/boot/dts/rk3xxx.dtsi @@ -181,7 +181,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; phys = <&usbphy0>; phy-names = "usb2-phy"; status = "disabled"; diff --git a/arch/arm/include/asm/xen/hypercall.h b/arch/arm/include/asm/xen/hypercall.h index 9d874db13c0e..3522cbaed316 100644 --- a/arch/arm/include/asm/xen/hypercall.h +++ b/arch/arm/include/asm/xen/hypercall.h @@ -1,87 +1 @@ -/****************************************************************************** - * hypercall.h - * - * Linux-specific hypervisor handling. - * - * Stefano Stabellini <stefano.stabellini@eu.citrix.com>, Citrix, 2012 - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef _ASM_ARM_XEN_HYPERCALL_H -#define _ASM_ARM_XEN_HYPERCALL_H - -#include <linux/bug.h> - -#include <xen/interface/xen.h> -#include <xen/interface/sched.h> -#include <xen/interface/platform.h> - -long privcmd_call(unsigned call, unsigned long a1, - unsigned long a2, unsigned long a3, - unsigned long a4, unsigned long a5); -int HYPERVISOR_xen_version(int cmd, void *arg); -int HYPERVISOR_console_io(int cmd, int count, char *str); -int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count); -int HYPERVISOR_sched_op(int cmd, void *arg); -int HYPERVISOR_event_channel_op(int cmd, void *arg); -unsigned long HYPERVISOR_hvm_op(int op, void *arg); -int HYPERVISOR_memory_op(unsigned int cmd, void *arg); -int HYPERVISOR_physdev_op(int cmd, void *arg); -int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); -int HYPERVISOR_tmem_op(void *arg); -int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type); -int HYPERVISOR_platform_op_raw(void *arg); -static inline int HYPERVISOR_platform_op(struct xen_platform_op *op) -{ - op->interface_version = XENPF_INTERFACE_VERSION; - return HYPERVISOR_platform_op_raw(op); -} -int HYPERVISOR_multicall(struct multicall_entry *calls, uint32_t nr); - -static inline int -HYPERVISOR_suspend(unsigned long start_info_mfn) -{ - struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; - - /* start_info_mfn is unused on ARM */ - return HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); -} - -static inline void -MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, - unsigned int new_val, unsigned long flags) -{ - BUG(); -} - -static inline void -MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, - int count, int *success_count, domid_t domid) -{ - BUG(); -} - -#endif /* _ASM_ARM_XEN_HYPERCALL_H */ +#include <xen/arm/hypercall.h> diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h index 95251512e2c4..d6e7709d0688 100644 --- a/arch/arm/include/asm/xen/hypervisor.h +++ b/arch/arm/include/asm/xen/hypervisor.h @@ -1,39 +1 @@ -#ifndef _ASM_ARM_XEN_HYPERVISOR_H -#define _ASM_ARM_XEN_HYPERVISOR_H - -#include <linux/init.h> - -extern struct shared_info *HYPERVISOR_shared_info; -extern struct start_info *xen_start_info; - -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -static inline enum paravirt_lazy_mode paravirt_get_lazy_mode(void) -{ - return PARAVIRT_LAZY_NONE; -} - -extern struct dma_map_ops *xen_dma_ops; - -#ifdef CONFIG_XEN -void __init xen_early_init(void); -#else -static inline void xen_early_init(void) { return; } -#endif - -#ifdef CONFIG_HOTPLUG_CPU -static inline void xen_arch_register_cpu(int num) -{ -} - -static inline void xen_arch_unregister_cpu(int num) -{ -} -#endif - -#endif /* _ASM_ARM_XEN_HYPERVISOR_H */ +#include <xen/arm/hypervisor.h> diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index 75d596862892..88c0d75da190 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -1,85 +1 @@ -/****************************************************************************** - * Guest OS interface to ARM Xen. - * - * Stefano Stabellini <stefano.stabellini@eu.citrix.com>, Citrix, 2012 - */ - -#ifndef _ASM_ARM_XEN_INTERFACE_H -#define _ASM_ARM_XEN_INTERFACE_H - -#include <linux/types.h> - -#define uint64_aligned_t uint64_t __attribute__((aligned(8))) - -#define __DEFINE_GUEST_HANDLE(name, type) \ - typedef struct { union { type *p; uint64_aligned_t q; }; } \ - __guest_handle_ ## name - -#define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name - -#define set_xen_guest_handle(hnd, val) \ - do { \ - if (sizeof(hnd) == 8) \ - *(uint64_t *)&(hnd) = 0; \ - (hnd).p = val; \ - } while (0) - -#define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op - -#ifndef __ASSEMBLY__ -/* Explicitly size integers that represent pfns in the interface with - * Xen so that we can have one ABI that works for 32 and 64 bit guests. - * Note that this means that the xen_pfn_t type may be capable of - * representing pfn's which the guest cannot represent in its own pfn - * type. However since pfn space is controlled by the guest this is - * fine since it simply wouldn't be able to create any sure pfns in - * the first place. - */ -typedef uint64_t xen_pfn_t; -#define PRI_xen_pfn "llx" -typedef uint64_t xen_ulong_t; -#define PRI_xen_ulong "llx" -typedef int64_t xen_long_t; -#define PRI_xen_long "llx" -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(void); -DEFINE_GUEST_HANDLE(uint64_t); -DEFINE_GUEST_HANDLE(uint32_t); -DEFINE_GUEST_HANDLE(xen_pfn_t); -DEFINE_GUEST_HANDLE(xen_ulong_t); - -/* Maximum number of virtual CPUs in multi-processor guests. */ -#define MAX_VIRT_CPUS 1 - -struct arch_vcpu_info { }; -struct arch_shared_info { }; - -/* TODO: Move pvclock definitions some place arch independent */ -struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; -} __attribute__((__packed__)); /* 32 bytes */ - -/* It is OK to have a 12 bytes struct with no padding because it is packed */ -struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - u32 sec_hi; -} __attribute__((__packed__)); -#endif - -#endif /* _ASM_ARM_XEN_INTERFACE_H */ +#include <xen/arm/interface.h> diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h index 95ce6ac3a971..b3ef061d8b74 100644 --- a/arch/arm/include/asm/xen/page-coherent.h +++ b/arch/arm/include/asm/xen/page-coherent.h @@ -1,98 +1 @@ -#ifndef _ASM_ARM_XEN_PAGE_COHERENT_H -#define _ASM_ARM_XEN_PAGE_COHERENT_H - -#include <asm/page.h> -#include <linux/dma-mapping.h> - -void __xen_dma_map_page(struct device *hwdev, struct page *page, - dma_addr_t dev_addr, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs); -void __xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); -void __xen_dma_sync_single_for_cpu(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir); - -void __xen_dma_sync_single_for_device(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir); - -static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) -{ - return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); -} - -static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) -{ - __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); -} - -static inline void xen_dma_map_page(struct device *hwdev, struct page *page, - dma_addr_t dev_addr, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - unsigned long page_pfn = page_to_xen_pfn(page); - unsigned long dev_pfn = XEN_PFN_DOWN(dev_addr); - unsigned long compound_pages = - (1<<compound_order(page)) * XEN_PFN_PER_PAGE; - bool local = (page_pfn <= dev_pfn) && - (dev_pfn - page_pfn < compound_pages); - - /* - * Dom0 is mapped 1:1, while the Linux page can span across - * multiple Xen pages, it's not possible for it to contain a - * mix of local and foreign Xen pages. So if the first xen_pfn - * == mfn the page is local otherwise it's a foreign page - * grant-mapped in dom0. If the page is local we can safely - * call the native dma_ops function, otherwise we call the xen - * specific function. - */ - if (local) - __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); - else - __xen_dma_map_page(hwdev, page, dev_addr, offset, size, dir, attrs); -} - -static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - unsigned long pfn = PFN_DOWN(handle); - /* - * Dom0 is mapped 1:1, while the Linux page can be spanned accross - * multiple Xen page, it's not possible to have a mix of local and - * foreign Xen page. Dom0 is mapped 1:1, so calling pfn_valid on a - * foreign mfn will always return false. If the page is local we can - * safely call the native dma_ops function, otherwise we call the xen - * specific function. - */ - if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->unmap_page) - __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); - } else - __xen_dma_unmap_page(hwdev, handle, size, dir, attrs); -} - -static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir) -{ - unsigned long pfn = PFN_DOWN(handle); - if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_cpu) - __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); - } else - __xen_dma_sync_single_for_cpu(hwdev, handle, size, dir); -} - -static inline void xen_dma_sync_single_for_device(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir) -{ - unsigned long pfn = PFN_DOWN(handle); - if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_device) - __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); - } else - __xen_dma_sync_single_for_device(hwdev, handle, size, dir); -} - -#endif /* _ASM_ARM_XEN_PAGE_COHERENT_H */ +#include <xen/arm/page-coherent.h> diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 415dbc6e43fd..31bbc803cecb 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -1,122 +1 @@ -#ifndef _ASM_ARM_XEN_PAGE_H -#define _ASM_ARM_XEN_PAGE_H - -#include <asm/page.h> -#include <asm/pgtable.h> - -#include <linux/pfn.h> -#include <linux/types.h> -#include <linux/dma-mapping.h> - -#include <xen/xen.h> -#include <xen/interface/grant_table.h> - -#define phys_to_machine_mapping_valid(pfn) (1) - -/* Xen machine address */ -typedef struct xmaddr { - phys_addr_t maddr; -} xmaddr_t; - -/* Xen pseudo-physical address */ -typedef struct xpaddr { - phys_addr_t paddr; -} xpaddr_t; - -#define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) -#define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) - -#define INVALID_P2M_ENTRY (~0UL) - -/* - * The pseudo-physical frame (pfn) used in all the helpers is always based - * on Xen page granularity (i.e 4KB). - * - * A Linux page may be split across multiple non-contiguous Xen page so we - * have to keep track with frame based on 4KB page granularity. - * - * PV drivers should never make a direct usage of those helpers (particularly - * pfn_to_gfn and gfn_to_pfn). - */ - -unsigned long __pfn_to_mfn(unsigned long pfn); -extern struct rb_root phys_to_mach; - -/* Pseudo-physical <-> Guest conversion */ -static inline unsigned long pfn_to_gfn(unsigned long pfn) -{ - return pfn; -} - -static inline unsigned long gfn_to_pfn(unsigned long gfn) -{ - return gfn; -} - -/* Pseudo-physical <-> BUS conversion */ -static inline unsigned long pfn_to_bfn(unsigned long pfn) -{ - unsigned long mfn; - - if (phys_to_mach.rb_node != NULL) { - mfn = __pfn_to_mfn(pfn); - if (mfn != INVALID_P2M_ENTRY) - return mfn; - } - - return pfn; -} - -static inline unsigned long bfn_to_pfn(unsigned long bfn) -{ - return bfn; -} - -#define bfn_to_local_pfn(bfn) bfn_to_pfn(bfn) - -/* VIRT <-> GUEST conversion */ -#define virt_to_gfn(v) (pfn_to_gfn(virt_to_phys(v) >> XEN_PAGE_SHIFT)) -#define gfn_to_virt(m) (__va(gfn_to_pfn(m) << XEN_PAGE_SHIFT)) - -/* Only used in PV code. But ARM guests are always HVM. */ -static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) -{ - BUG(); -} - -/* TODO: this shouldn't be here but it is because the frontend drivers - * are using it (its rolled in headers) even though we won't hit the code path. - * So for right now just punt with this. - */ -static inline pte_t *lookup_address(unsigned long address, unsigned int *level) -{ - BUG(); - return NULL; -} - -extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count); - -extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_unmap_grant_ref *kunmap_ops, - struct page **pages, unsigned int count); - -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn, - unsigned long nr_pages); - -static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - return __set_phys_to_machine(pfn, mfn); -} - -#define xen_remap(cookie, size) ioremap_cache((cookie), (size)) -#define xen_unmap(cookie) iounmap((cookie)) - -bool xen_arch_need_swiotlb(struct device *dev, - phys_addr_t phys, - dma_addr_t dev_addr); -unsigned long xen_get_swiotlb_free_pages(unsigned int order); - -#endif /* _ASM_ARM_XEN_PAGE_H */ +#include <xen/arm/page.h> diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index b38c10c73579..af05f8e0903e 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -87,9 +87,11 @@ struct kvm_regs { /* Supported VGICv3 address types */ #define KVM_VGIC_V3_ADDR_TYPE_DIST 2 #define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 +#define KVM_VGIC_ITS_ADDR_TYPE 4 #define KVM_VGIC_V3_DIST_SIZE SZ_64K #define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) +#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3e1cd0452d67..90d0176fb30d 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -34,6 +34,7 @@ config KVM select HAVE_KVM_IRQFD select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER ---help--- Support hosting virtualized guest machines. diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index f19842ea5418..d571243ab4d1 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -32,5 +32,6 @@ obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o +obj-y += $(KVM)/arm/vgic/vgic-its.o obj-y += $(KVM)/irqchip.o obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 19b5f5c1c0ff..8f92efa8460e 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -221,6 +221,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; default: r = kvm_arch_dev_ioctl_check_extension(kvm, ext); break; diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index f193414d0f6f..4986dc0c1dff 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -372,8 +372,7 @@ static int __init xen_guest_init(void) * for secondary CPUs as they are brought up. * For uniformity we use VCPUOP_register_vcpu_info even on cpu0. */ - xen_vcpu_info = __alloc_percpu(sizeof(struct vcpu_info), - sizeof(struct vcpu_info)); + xen_vcpu_info = alloc_percpu(struct vcpu_info); if (xen_vcpu_info == NULL) return -ENOMEM; diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index d062f08f5020..bd62d94f8ac5 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -186,7 +186,6 @@ struct dma_map_ops *xen_dma_ops; EXPORT_SYMBOL(xen_dma_ops); static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 657be7f5014e..111742126897 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -110,6 +110,7 @@ config ARM64 select POWER_SUPPLY select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK help ARM 64-bit (AArch64) Linux support. @@ -239,6 +240,9 @@ config PGTABLE_LEVELS default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47 default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48 +config ARCH_SUPPORTS_UPROBES + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -791,6 +795,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index b661fe742615..d1ebd46872fd 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -2,9 +2,13 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config ARM64_PTDUMP +config ARM64_PTDUMP_CORE + def_bool n + +config ARM64_PTDUMP_DEBUGFS bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL + select ARM64_PTDUMP_CORE select DEBUG_FS help Say Y here if you want to show the kernel pagetable layout in a @@ -38,6 +42,35 @@ config ARM64_RANDOMIZE_TEXT_OFFSET of TEXT_OFFSET and platforms must not require a specific value. +config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + This check also includes UXN, which should be set on all kernel + mappings. + + Look for a message in dmesg output like this: + + arm64/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 3635b8662724..b9a4a934ca05 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -37,10 +37,16 @@ $(warning LSE atomics not supported by binutils) endif endif -KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) +brokengasinst := $(call as-instr,1:\n.inst 0\n.rept . - 1b\n\nnop\n.endr\n,,-DCONFIG_BROKEN_GAS_INST=1) + +ifneq ($(brokengasinst),) +$(warning Detected assembler with broken .inst; disassembly will be unreliable) +endif + +KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) $(brokengasinst) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) -KBUILD_AFLAGS += $(lseinstr) +KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 17839db585d5..e0ea60382087 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -747,7 +747,6 @@ clocks = <&sys_ctrl HI6220_USBOTG_HCLK>; clock-names = "otg"; dr_mode = "otg"; - g-use-dma; g-rx-fifo-size = <512>; g-np-tx-fifo-size = <128>; g-tx-fifo-size = <128 128 128 128 128 128>; diff --git a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts index 2a7f731c7759..0ecaad4333a7 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts +++ b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts @@ -34,15 +34,6 @@ chosen { }; - usb_p1_vbus: regulator@0 { - compatible = "regulator-fixed"; - regulator-name = "usb_vbus"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - gpio = <&pio 130 GPIO_ACTIVE_HIGH>; - enable-active-high; - }; - connector { compatible = "hdmi-connector"; label = "hdmi"; @@ -54,6 +45,29 @@ }; }; }; + + extcon_usb: extcon_iddig { + compatible = "linux,extcon-usb-gpio"; + id-gpio = <&pio 16 GPIO_ACTIVE_HIGH>; + }; + + usb_p1_vbus: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "usb_vbus"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&pio 130 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + + usb_p0_vbus: regulator@1 { + compatible = "regulator-fixed"; + regulator-name = "vbus"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&pio 9 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; }; &cec { @@ -243,6 +257,20 @@ bias-pull-down = <MTK_PUPD_SET_R1R0_10>; }; }; + + usb_id_pins_float: usb_iddig_pull_up { + pins_iddig { + pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>; + bias-pull-up; + }; + }; + + usb_id_pins_ground: usb_iddig_pull_down { + pins_iddig { + pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>; + bias-pull-down; + }; + }; }; &pwm0 { @@ -469,12 +497,25 @@ status = "okay"; }; +&ssusb { + vusb33-supply = <&mt6397_vusb_reg>; + vbus-supply = <&usb_p0_vbus>; + extcon = <&extcon_usb>; + dr_mode = "otg"; + mediatek,enable-wakeup; + pinctrl-names = "default", "id_float", "id_ground"; + pinctrl-0 = <&usb_id_pins_float>; + pinctrl-1 = <&usb_id_pins_float>; + pinctrl-2 = <&usb_id_pins_ground>; + status = "okay"; +}; + &uart0 { status = "okay"; }; -&usb30 { +&usb_host { vusb33-supply = <&mt6397_vusb_reg>; vbus-supply = <&usb_p1_vbus>; - mediatek,wakeup-src = <1>; + status = "okay"; }; diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi index 1c71e256601d..c2d588ca59b7 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi @@ -707,11 +707,14 @@ status = "disabled"; }; - usb30: usb@11270000 { - compatible = "mediatek,mt8173-xhci"; - reg = <0 0x11270000 0 0x1000>, + ssusb: usb@11271000 { + compatible = "mediatek,mt8173-mtu3"; + reg = <0 0x11271000 0 0x3000>, <0 0x11280700 0 0x0100>; - interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + reg-names = "mac", "ippc"; + interrupts = <GIC_SPI 64 IRQ_TYPE_LEVEL_LOW>; + phys = <&phy_port0 PHY_TYPE_USB3>, + <&phy_port1 PHY_TYPE_USB2>; power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; clocks = <&topckgen CLK_TOP_USB30_SEL>, <&pericfg CLK_PERI_USB0>, @@ -719,10 +722,22 @@ clock-names = "sys_ck", "wakeup_deb_p0", "wakeup_deb_p1"; - phys = <&phy_port0 PHY_TYPE_USB3>, - <&phy_port1 PHY_TYPE_USB2>; mediatek,syscon-wakeup = <&pericfg>; - status = "okay"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + status = "disabled"; + + usb_host: xhci@11270000 { + compatible = "mediatek,mt8173-xhci"; + reg = <0 0x11270000 0 0x1000>; + reg-names = "mac"; + interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; + clocks = <&topckgen CLK_TOP_USB30_SEL>; + clock-names = "sys_ck"; + status = "disabled"; + }; }; u3phy: usb-phy@11290000 { diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 0fcb2147c9f9..df231c4df5a5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -537,7 +537,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; status = "disabled"; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 6be08113a96d..c3caaddde6cc 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -82,6 +82,7 @@ CONFIG_KEXEC=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_COMPAT=y CONFIG_CPU_IDLE=y +CONFIG_HIBERNATION=y CONFIG_ARM_CPUIDLE=y CONFIG_CPU_FREQ=y CONFIG_CPUFREQ_DT=y diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index b4ab238a59ec..8365a84c2640 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -1,7 +1,6 @@ generic-y += bugs.h generic-y += clkdev.h generic-y += cputime.h -generic-y += current.h generic-y += delay.h generic-y += div64.h generic-y += dma.h diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 28bfe6132eb6..446f6c46d4b1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -41,6 +41,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -202,14 +211,25 @@ lr .req x30 // link register .endm /* + * @dst: Result of per_cpu(sym, smp_processor_id()) * @sym: The name of the per-cpu variable - * @reg: Result of per_cpu(sym, smp_processor_id()) * @tmp: scratch register */ - .macro this_cpu_ptr, sym, reg, tmp - adr_l \reg, \sym + .macro adr_this_cpu, dst, sym, tmp + adr_l \dst, \sym mrs \tmp, tpidr_el1 - add \reg, \reg, \tmp + add \dst, \dst, \tmp + .endm + + /* + * @dst: Result of READ_ONCE(per_cpu(sym, smp_processor_id())) + * @sym: The name of the per-cpu variable + * @tmp: scratch register + */ + .macro ldr_this_cpu dst, sym, tmp + adr_l \dst, \sym + mrs \tmp, tpidr_el1 + ldr \dst, [\dst, \tmp] .endm /* @@ -395,4 +415,24 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if ARM64_WORKAROUND_CAVIUM_27456 + ic iallu + dsb nsh + isb +alternative_else_nop_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 2e5fb976a572..5a2a6ee65f65 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -65,12 +65,12 @@ * - kaddr - page address * - size - region size */ -extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); +extern void sync_icache_aliases(void *kaddr, unsigned long len); static inline void flush_cache_mm(struct mm_struct *mm) { @@ -81,6 +81,11 @@ static inline void flush_cache_page(struct vm_area_struct *vma, { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} + /* * Cache maintenance functions used by the DMA API. No to be used directly. */ diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 87b446535185..4174f09678c4 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -34,7 +34,8 @@ #define ARM64_HAS_32BIT_EL0 13 #define ARM64_HYP_OFFSET_LOW 14 #define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 +#define ARM64_HAS_NO_FPSIMD 16 -#define ARM64_NCAPS 16 +#define ARM64_NCAPS 17 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 0bc0b1de90c4..b4989df48670 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -9,8 +9,6 @@ #ifndef __ASM_CPUFEATURE_H #define __ASM_CPUFEATURE_H -#include <linux/jump_label.h> - #include <asm/cpucaps.h> #include <asm/hwcap.h> #include <asm/sysreg.h> @@ -27,6 +25,8 @@ #ifndef __ASSEMBLY__ +#include <linux/bug.h> +#include <linux/jump_label.h> #include <linux/kernel.h> /* CPU feature register tracking */ @@ -104,14 +104,19 @@ static inline bool cpu_have_feature(unsigned int num) return elf_hwcap & (1UL << num); } +/* System capability check for constant caps */ +static inline bool cpus_have_const_cap(int num) +{ + if (num >= ARM64_NCAPS) + return false; + return static_branch_unlikely(&cpu_hwcap_keys[num]); +} + static inline bool cpus_have_cap(unsigned int num) { if (num >= ARM64_NCAPS) return false; - if (__builtin_constant_p(num)) - return static_branch_unlikely(&cpu_hwcap_keys[num]); - else - return test_bit(num, cpu_hwcaps); + return test_bit(num, cpu_hwcaps); } static inline void cpus_set_cap(unsigned int num) @@ -200,7 +205,7 @@ static inline bool cpu_supports_mixed_endian_el0(void) static inline bool system_supports_32bit_el0(void) { - return cpus_have_cap(ARM64_HAS_32BIT_EL0); + return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); } static inline bool system_supports_mixed_endian_el0(void) @@ -208,6 +213,17 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_supports_fpsimd(void) +{ + return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); +} + +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h new file mode 100644 index 000000000000..f2bcbe2d9889 --- /dev/null +++ b/arch/arm64/include/asm/current.h @@ -0,0 +1,22 @@ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include <linux/compiler.h> + +#include <asm/sysreg.h> + +#ifndef __ASSEMBLY__ + +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return (struct task_struct *)read_sysreg(sp_el0); +} + +#define current get_current() + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ + diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index b71420a12f26..a44cf5225429 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -68,6 +68,9 @@ #define BRK64_ESR_MASK 0xFFFF #define BRK64_ESR_KPROBES 0x0004 #define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5)) +/* uprobes BRK opcodes with ESR encoding */ +#define BRK64_ESR_UPROBES 0x0005 +#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (BRK64_ESR_UPROBES << 5)) /* AArch32 */ #define DBG_ESR_EVT_BKPT 0x4 diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 771b3f0bc757..0b6b1633017f 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include <asm/cpufeature.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/neon.h> @@ -78,7 +79,30 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). + */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index a55384f4a5d7..5d1700425efe 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -138,7 +138,11 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_PLAT_INIT(_r, load_addr) (_r)->regs[0] = 0 -#define SET_PERSONALITY(ex) clear_thread_flag(TIF_32BIT); +#define SET_PERSONALITY(ex) \ +({ \ + clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ + clear_thread_flag(TIF_32BIT); \ +}) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ @@ -183,7 +187,11 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread -#define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT); +#define COMPAT_SET_PERSONALITY(ex) \ +({ \ + set_bit(TIF_32BIT, ¤t->mm->context.flags); \ + set_thread_flag(TIF_32BIT); \ + }) #define COMPAT_ARCH_DLINFO extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp); diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..85c4a8981d47 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -21,15 +21,12 @@ #include <linux/futex.h> #include <linux/uaccess.h> -#include <asm/alternative.h> -#include <asm/cpufeature.h> #include <asm/errno.h> -#include <asm/sysreg.h> #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +41,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +115,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +131,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 9510ace570e2..b6b167ac082b 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -77,7 +77,11 @@ static inline void decode_ctrl_reg(u32 reg, /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ @@ -119,7 +123,7 @@ struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type); + int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct perf_event *bp); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include <asm/pgtable.h> #include <asm/sparsemem.h> /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 8d9fce037b2f..47619411f0ff 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -19,6 +19,7 @@ typedef struct { atomic64_t id; void *vdso; + unsigned long flags; } mm_context_t; /* @@ -34,7 +35,7 @@ extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot, bool allow_block_mappings); + pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a50185375f09..0363fe80455c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include <linux/sched.h> #include <asm/cacheflush.h> +#include <asm/cpufeature.h> #include <asm/proc-fns.h> #include <asm-generic/mm_hooks.h> #include <asm/cputype.h> @@ -103,7 +104,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -163,20 +164,26 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) { - unsigned int cpu = smp_processor_id(); + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif - if (prev == next) - return; +static inline void __switch_mm(struct mm_struct *next) +{ + unsigned int cpu = smp_processor_id(); /* * init_mm.pgd does not contain any user mappings and it is always @@ -190,8 +197,26 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). Avoid setting the reserved + * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). + */ + if (next != &init_mm) + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) void verify_cpu_asid_bits(void); diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index 13ce4cc18e26..ad4cdc966c0f 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -9,8 +9,9 @@ */ #include <linux/types.h> +#include <asm/fpsimd.h> -#define cpu_has_neon() (1) +#define cpu_has_neon() system_supports_fpsimd() #define kernel_neon_begin() kernel_neon_begin_partial(32) diff --git a/arch/arm64/include/asm/opcodes.h b/arch/arm64/include/asm/opcodes.h deleted file mode 100644 index 123f45d92cd1..000000000000 --- a/arch/arm64/include/asm/opcodes.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_CPU_BIG_ENDIAN -#define CONFIG_CPU_ENDIAN_BE8 CONFIG_CPU_BIG_ENDIAN -#endif - -#include <../../arm/include/asm/opcodes.h> diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 5394c8405e66..3bd498e4de4c 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -16,6 +16,8 @@ #ifndef __ASM_PERCPU_H #define __ASM_PERCPU_H +#include <asm/stack_pointer.h> + static inline void set_my_cpu_offset(unsigned long off) { asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory"); @@ -101,16 +103,16 @@ static inline unsigned long __percpu_read(void *ptr, int size) switch (size) { case 1: - ret = ACCESS_ONCE(*(u8 *)ptr); + ret = READ_ONCE(*(u8 *)ptr); break; case 2: - ret = ACCESS_ONCE(*(u16 *)ptr); + ret = READ_ONCE(*(u16 *)ptr); break; case 4: - ret = ACCESS_ONCE(*(u32 *)ptr); + ret = READ_ONCE(*(u32 *)ptr); break; case 8: - ret = ACCESS_ONCE(*(u64 *)ptr); + ret = READ_ONCE(*(u64 *)ptr); break; default: BUILD_BUG(); @@ -123,16 +125,16 @@ static inline void __percpu_write(void *ptr, unsigned long val, int size) { switch (size) { case 1: - ACCESS_ONCE(*(u8 *)ptr) = (u8)val; + WRITE_ONCE(*(u8 *)ptr, (u8)val); break; case 2: - ACCESS_ONCE(*(u16 *)ptr) = (u16)val; + WRITE_ONCE(*(u16 *)ptr, (u16)val); break; case 4: - ACCESS_ONCE(*(u32 *)ptr) = (u32)val; + WRITE_ONCE(*(u32 *)ptr, (u32)val); break; case 8: - ACCESS_ONCE(*(u64 *)ptr) = (u64)val; + WRITE_ONCE(*(u64 *)ptr, (u64)val); break; default: BUILD_BUG(); diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 38b6a2b49d68..8d5cbec17d80 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -17,6 +17,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H +#include <asm/stack_pointer.h> + #define ARMV8_PMU_MAX_COUNTERS 32 #define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 5af574d632fa..6a5b28904c33 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -15,21 +15,22 @@ #ifndef _ARM_PROBES_H #define _ARM_PROBES_H -#include <asm/opcodes.h> - -struct kprobe; -struct arch_specific_insn; - -typedef u32 kprobe_opcode_t; -typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *); +typedef u32 probe_opcode_t; +typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); /* architecture specific copy of original instruction */ -struct arch_specific_insn { - kprobe_opcode_t *insn; +struct arch_probe_insn { + probe_opcode_t *insn; pstate_check_t *pstate_cc; - kprobes_handler_t *handler; + probes_handler_t *handler; /* restore address after step xol */ unsigned long restore; }; +#ifdef CONFIG_KPROBES +typedef u32 kprobe_opcode_t; +struct arch_specific_insn { + struct arch_probe_insn api; +}; +#endif #endif diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 07b8ed037dee..6afd8476c60c 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -16,9 +16,10 @@ #ifndef __ASM_PTDUMP_H #define __ASM_PTDUMP_H -#ifdef CONFIG_ARM64_PTDUMP +#ifdef CONFIG_ARM64_PTDUMP_CORE #include <linux/mm_types.h> +#include <linux/seq_file.h> struct addr_marker { unsigned long start_address; @@ -29,16 +30,25 @@ struct ptdump_info { struct mm_struct *mm; const struct addr_marker *markers; unsigned long base_addr; - unsigned long max_addr; }; -int ptdump_register(struct ptdump_info *info, const char *name); - +void ptdump_walk_pgd(struct seq_file *s, struct ptdump_info *info); +#ifdef CONFIG_ARM64_PTDUMP_DEBUGFS +int ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else -static inline int ptdump_register(struct ptdump_info *info, const char *name) +static inline int ptdump_debugfs_register(struct ptdump_info *info, + const char *name) { return 0; } -#endif /* CONFIG_ARM64_PTDUMP */ +#endif +void ptdump_check_wx(void); +#endif /* CONFIG_ARM64_PTDUMP_CORE */ + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_check_wx() +#else +#define debug_checkwx() do { } while (0) +#endif #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index ada08b5b036d..513daf050e84 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -217,6 +217,14 @@ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); #include <asm-generic/ptrace.h> +#define procedure_link_pointer(regs) ((regs)->regs[30]) + +static inline void procedure_link_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + procedure_link_pointer(regs) = val; +} + #undef profile_pc extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 022644704a93..d050d720a1b4 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -29,11 +29,22 @@ #ifndef __ASSEMBLY__ +#include <asm/percpu.h> + #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/thread_info.h> -#define raw_smp_processor_id() (current_thread_info()->cpu) +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); + +/* + * We don't use this_cpu_read(cpu_number) as that has implicit writes to + * preempt_count, and associated (compiler) barriers, that we'd like to avoid + * the expense of. If we're preemptible, the value can be stale at use anyway. + * And we can't use this_cpu_ptr() either, as that winds up recursing back + * here under CONFIG_DEBUG_PREEMPT=y. + */ +#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number)) struct seq_file; @@ -73,6 +84,7 @@ asmlinkage void secondary_start_kernel(void); */ struct secondary_data { void *stack; + struct task_struct *task; long status; }; diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h new file mode 100644 index 000000000000..ffcdf742cddf --- /dev/null +++ b/arch/arm64/include/asm/stack_pointer.h @@ -0,0 +1,9 @@ +#ifndef __ASM_STACK_POINTER_H +#define __ASM_STACK_POINTER_H + +/* + * how to get the current stack pointer from C + */ +register unsigned long current_stack_pointer asm ("sp"); + +#endif /* __ASM_STACK_POINTER_H */ diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index b8a313fd7a09..de5600f40adf 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 10 +#define NR_CTX_REGS 12 #define NR_CALLEE_SAVED_REGS 12 /* diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6c80b3699cb8..98ae03f8eedd 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -22,8 +22,6 @@ #include <linux/stringify.h> -#include <asm/opcodes.h> - /* * ARMv8 ARM reserves the following encoding for system registers: * (Ref: ARMv8 ARM, Section: "System instruction class encoding overview", @@ -37,6 +35,33 @@ #define sys_reg(op0, op1, crn, crm, op2) \ ((((op0)&3)<<19)|((op1)<<16)|((crn)<<12)|((crm)<<8)|((op2)<<5)) +#ifndef CONFIG_BROKEN_GAS_INST + +#ifdef __ASSEMBLY__ +#define __emit_inst(x) .inst (x) +#else +#define __emit_inst(x) ".inst " __stringify((x)) "\n\t" +#endif + +#else /* CONFIG_BROKEN_GAS_INST */ + +#ifndef CONFIG_CPU_BIG_ENDIAN +#define __INSTR_BSWAP(x) (x) +#else /* CONFIG_CPU_BIG_ENDIAN */ +#define __INSTR_BSWAP(x) ((((x) << 24) & 0xff000000) | \ + (((x) << 8) & 0x00ff0000) | \ + (((x) >> 8) & 0x0000ff00) | \ + (((x) >> 24) & 0x000000ff)) +#endif /* CONFIG_CPU_BIG_ENDIAN */ + +#ifdef __ASSEMBLY__ +#define __emit_inst(x) .long __INSTR_BSWAP(x) +#else /* __ASSEMBLY__ */ +#define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" +#endif /* __ASSEMBLY__ */ + +#endif /* CONFIG_BROKEN_GAS_INST */ + #define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0) #define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5) #define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6) @@ -81,10 +106,10 @@ #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) #define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) -#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ - (!!x)<<8 | 0x1f) -#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ - (!!x)<<8 | 0x1f) +#define SET_PSTATE_PAN(x) __emit_inst(0xd5000000 | REG_PSTATE_PAN_IMM | \ + (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __emit_inst(0xd5000000 | REG_PSTATE_UAO_IMM | \ + (!!x)<<8 | 0x1f) /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_EE (1 << 25) @@ -228,11 +253,11 @@ .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) + __emit_inst(0xd5200000|(\sreg)|(.L__reg_num_\rt)) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) + __emit_inst(0xd5000000|(\sreg)|(.L__reg_num_\rt)) .endm #else @@ -246,11 +271,11 @@ asm( " .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" + __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" + __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) " .endm\n" ); diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e9ea5a6bd449..46c3b93cf865 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -36,58 +36,31 @@ struct task_struct; +#include <asm/stack_pointer.h> #include <asm/types.h> typedef unsigned long mm_segment_t; /* * low level task data that entry.S needs immediate access to. - * __switch_to() assumes cpu_context follows immediately after cpu_domain. */ struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ - struct task_struct *task; /* main task structure */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif int preempt_count; /* 0 => preemptable, <0 => bug */ - int cpu; /* cpu */ }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) -/* - * how to get the current stack pointer from C - */ -register unsigned long current_stack_pointer asm ("sp"); - -/* - * how to get the thread information struct from C - */ -static inline struct thread_info *current_thread_info(void) __attribute_const__; - -/* - * struct thread_info can be accessed directly via sp_el0. - * - * We don't use read_sysreg() as we want the compiler to cache the value where - * possible. - */ -static inline struct thread_info *current_thread_info(void) -{ - unsigned long sp_el0; - - asm ("mrs %0, sp_el0" : "=r" (sp_el0)); - - return (struct thread_info *)sp_el0; -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ @@ -112,6 +85,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ +#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -132,10 +106,12 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 55d0adbf6509..d26750ca6e06 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,6 +18,12 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#include <asm/alternative.h> +#include <asm/kernel-pgtable.h> +#include <asm/sysreg.h> + +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ @@ -26,10 +32,8 @@ #include <linux/string.h> #include <linux/thread_info.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/ptrace.h> -#include <asm/sysreg.h> #include <asm/errno.h> #include <asm/memory.h> #include <asm/compiler.h> @@ -120,6 +124,99 @@ static inline void set_fs(mm_segment_t fs) " .popsection\n" /* + * User access enabling/disabling. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void __uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void __uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} + +static inline bool uaccess_ttbr0_disable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_disable(); + return true; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_enable(); + return true; +} +#else +static inline bool uaccess_ttbr0_disable(void) +{ + return false; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + return false; +} +#endif + +#define __uaccess_disable(alt) \ +do { \ + if (!uaccess_ttbr0_disable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + if (!uaccess_ttbr0_enable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + +/* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" * call. @@ -146,8 +243,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -168,9 +264,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -215,8 +310,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -237,8 +331,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -331,4 +424,66 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include <asm/assembler.h> + +/* + * User access enabling/disabling macros. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + .macro __uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro __uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_disable, tmp1 +alternative_if_not ARM64_HAS_PAN + __uaccess_ttbr0_disable \tmp1 +alternative_else_nop_endif + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + __uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else_nop_endif + .endm +#else + .macro uaccess_ttbr0_disable, tmp1 + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 + .endm +#endif + +/* + * These macros are no-ops when UAO is present. + */ + .macro uaccess_disable_not_uao, tmp1 + uaccess_ttbr0_disable \tmp1 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(1) +alternative_else_nop_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 + uaccess_ttbr0_enable \tmp1, \tmp2 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(0) +alternative_else_nop_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h new file mode 100644 index 000000000000..8d004073d0e8 --- /dev/null +++ b/arch/arm64/include/asm/uprobes.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H + +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/probes.h> + +#define MAX_UINSN_BYTES AARCH64_INSN_SIZE + +#define UPROBE_SWBP_INSN BRK64_OPCODE_UPROBES +#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE +#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES + +typedef u32 uprobe_opcode_t; + +struct arch_uprobe_task { +}; + +struct arch_uprobe { + union { + u8 insn[MAX_UINSN_BYTES]; + u8 ixol[MAX_UINSN_BYTES]; + }; + struct arch_probe_insn api; + bool simulate; +}; + +#endif diff --git a/arch/arm64/include/asm/xen/hypercall.h b/arch/arm64/include/asm/xen/hypercall.h index 74b0c423ff5b..3522cbaed316 100644 --- a/arch/arm64/include/asm/xen/hypercall.h +++ b/arch/arm64/include/asm/xen/hypercall.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/hypercall.h> +#include <xen/arm/hypercall.h> diff --git a/arch/arm64/include/asm/xen/hypervisor.h b/arch/arm64/include/asm/xen/hypervisor.h index f263da8e8769..d6e7709d0688 100644 --- a/arch/arm64/include/asm/xen/hypervisor.h +++ b/arch/arm64/include/asm/xen/hypervisor.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/hypervisor.h> +#include <xen/arm/hypervisor.h> diff --git a/arch/arm64/include/asm/xen/interface.h b/arch/arm64/include/asm/xen/interface.h index 44457aebeed4..88c0d75da190 100644 --- a/arch/arm64/include/asm/xen/interface.h +++ b/arch/arm64/include/asm/xen/interface.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/interface.h> +#include <xen/arm/interface.h> diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h index 2052102b4e02..b3ef061d8b74 100644 --- a/arch/arm64/include/asm/xen/page-coherent.h +++ b/arch/arm64/include/asm/xen/page-coherent.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/page-coherent.h> +#include <xen/arm/page-coherent.h> diff --git a/arch/arm64/include/asm/xen/page.h b/arch/arm64/include/asm/xen/page.h index bed87ec36780..31bbc803cecb 100644 --- a/arch/arm64/include/asm/xen/page.h +++ b/arch/arm64/include/asm/xen/page.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/page.h> +#include <xen/arm/page.h> diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index b0988bb1bf64..04de188a36c9 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -14,10 +14,8 @@ #include <linux/slab.h> #include <linux/sysctl.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> -#include <asm/opcodes.h> #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/traps.h> @@ -285,10 +283,10 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) #define __SWP_LL_SC_LOOPS 4 #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ " mov %w3, %w7\n" \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%4]\n" \ "1: stxr"B" %w0, %w1, [%4]\n" \ " cbz %w0, 2f\n" \ @@ -306,12 +304,12 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ __user_swpX_asm(data, addr, res, temp, temp2, "") @@ -352,6 +350,10 @@ static int emulate_swpX(unsigned int address, unsigned int *data, return res; } +#define ARM_OPCODE_CONDTEST_FAIL 0 +#define ARM_OPCODE_CONDTEST_PASS 1 +#define ARM_OPCODE_CONDTEST_UNCOND 2 + #define ARM_OPCODE_CONDITION_UNCOND 0xf static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 4a2f0f0fef32..bc049afc73a7 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,11 +36,13 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); - DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); - DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); +#endif + DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); BLANK(); @@ -123,6 +125,7 @@ int main(void) DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c02504ea304b..fdf8f045929f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -47,6 +47,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcap_keys); @@ -746,6 +747,14 @@ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); } +static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) +{ + u64 pfr0 = read_system_reg(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_signed_field(pfr0, + ID_AA64PFR0_FP_SHIFT) < 0; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -829,6 +838,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, + { + /* FP/SIMD is not implemented */ + .capability = ARM64_HAS_NO_FPSIMD, + .def_scope = SCOPE_SYSTEM, + .min_field_value = 0, + .matches = has_no_fpsimd, + }, {}, }; @@ -1102,5 +1118,5 @@ void __init setup_cpu_features(void) static bool __maybe_unused cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) { - return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); + return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 73ae90ef434c..605df76f0a06 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -226,6 +226,8 @@ static void send_user_sigtrap(int si_code) static int single_step_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { + bool handler_found = false; + /* * If we are stepping a pending breakpoint, call the hw_breakpoint * handler first. @@ -233,7 +235,14 @@ static int single_step_handler(unsigned long addr, unsigned int esr, if (!reinstall_suspended_bps(regs)) return 0; - if (user_mode(regs)) { +#ifdef CONFIG_KPROBES + if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; +#endif + if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; + + if (!handler_found && user_mode(regs)) { send_user_sigtrap(TRAP_TRACE); /* @@ -243,15 +252,8 @@ static int single_step_handler(unsigned long addr, unsigned int esr, * to the active-not-pending state). */ user_rewind_single_step(current); - } else { -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - return 0; -#endif - if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED) - return 0; - - pr_warning("Unexpected kernel single-step exception at EL1\n"); + } else if (!handler_found) { + pr_warn("Unexpected kernel single-step exception at EL1\n"); /* * Re-enable stepping since we know that we will be * returning to regs. @@ -304,16 +306,20 @@ NOKPROBE_SYMBOL(call_break_hook); static int brk_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - if (user_mode(regs)) { - send_user_sigtrap(TRAP_BRKPT); - } + bool handler_found = false; + #ifdef CONFIG_KPROBES - else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED) - return -EFAULT; + if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { + if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; } #endif - else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) { + if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; + + if (!handler_found && user_mode(regs)) { + send_user_sigtrap(TRAP_BRKPT); + } else if (!handler_found) { pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index ba9bee389fd5..5d17f377d905 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -62,8 +62,8 @@ struct screen_info screen_info __section(.data); int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { pteval_t prot_val = create_mapping_protection(md); - bool allow_block_mappings = (md->type != EFI_RUNTIME_SERVICES_CODE && - md->type != EFI_RUNTIME_SERVICES_DATA); + bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || + md->type == EFI_RUNTIME_SERVICES_DATA); if (!PAGE_ALIGNED(md->phys_addr) || !PAGE_ALIGNED(md->num_pages << EFI_PAGE_SHIFT)) { @@ -76,12 +76,12 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) * from the MMU routines. So avoid block mappings altogether in * that case. */ - allow_block_mappings = false; + page_mappings_only = true; } create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, - __pgprot(prot_val | PTE_NG), allow_block_mappings); + __pgprot(prot_val | PTE_NG), page_mappings_only); return 0; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 223d54a4d66b..4f0d76339414 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include <asm/esr.h> #include <asm/irq.h> #include <asm/memory.h> +#include <asm/ptrace.h> #include <asm/thread_info.h> +#include <asm/uaccess.h> #include <asm/unistd.h> /* @@ -90,9 +92,8 @@ .if \el == 0 mrs x21, sp_el0 - mov tsk, sp - and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, - ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug + ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. mov x29, xzr // fp pointed to user-space @@ -100,15 +101,41 @@ add x21, sp, #S_FRAME_SIZE get_thread_info tsk /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ - ldr x20, [tsk, #TI_ADDR_LIMIT] + ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] str x20, [sp, #S_ORIG_ADDR_LIMIT] mov x20, #TASK_SIZE_64 - str x20, [tsk, #TI_ADDR_LIMIT] + str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +alternative_if ARM64_HAS_PAN + b 1f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + __uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -139,7 +166,7 @@ .if \el != 0 /* Restore the task's original addr_limit. */ ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TI_ADDR_LIMIT] + str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif @@ -147,6 +174,40 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if ARM64_HAS_PAN + b 2f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + .endif + + __uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -162,6 +223,7 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] @@ -184,23 +246,20 @@ alternative_else_nop_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp /* - * Compare sp with the current thread_info, if the top - * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and - * should switch to the irq stack. + * Compare sp with the base of the task stack. + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, + * and should switch to the irq stack. */ - and x25, x19, #~(THREAD_SIZE - 1) - cmp x25, tsk - b.ne 9998f + ldr x25, [tsk, TSK_STACK] + eor x25, x25, x19 + and x25, x25, #~(THREAD_SIZE - 1) + cbnz x25, 9998f - this_cpu_ptr irq_stack, x25, x26 + adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 @@ -427,9 +486,9 @@ el1_irq: irq_handler #ifdef CONFIG_PREEMPT - ldr w24, [tsk, #TI_PREEMPT] // get preempt count + ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 - ldr x0, [tsk, #TI_FLAGS] // get flags + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? bl el1_preempt 1: @@ -444,7 +503,7 @@ ENDPROC(el1_irq) el1_preempt: mov x24, lr 1: bl preempt_schedule_irq // irq en/disable is done inside - ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS + ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? ret x24 #endif @@ -674,8 +733,7 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 - and x9, x9, #~(THREAD_SIZE - 1) - msr sp_el0, x9 + msr sp_el0, x1 ret ENDPROC(cpu_switch_to) @@ -686,7 +744,7 @@ ENDPROC(cpu_switch_to) ret_fast_syscall: disable_irq // disable interrupts str x0, [sp, #S_X0] // returned x0 - ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace and x2, x1, #_TIF_WORK_MASK @@ -706,14 +764,14 @@ work_pending: #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on // enabled while in userspace #endif - ldr x1, [tsk, #TI_FLAGS] // re-check for single-step + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step b finish_ret_to_user /* * "slow" syscall return path. */ ret_to_user: disable_irq // disable interrupts - ldr x1, [tsk, #TI_FLAGS] + ldr x1, [tsk, #TSK_TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending finish_ret_to_user: @@ -746,7 +804,7 @@ el0_svc_naked: // compat entry point enable_dbg_and_irq ct_user_exit 1 - ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks + ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 394c61db5566..b883f1f75216 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -127,6 +127,8 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) void fpsimd_thread_switch(struct task_struct *next) { + if (!system_supports_fpsimd()) + return; /* * Save the current FPSIMD state to memory, but only if whatever is in * the registers is in fact the most recent userland FPSIMD state of @@ -157,6 +159,8 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { + if (!system_supports_fpsimd()) + return; memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); @@ -168,6 +172,8 @@ void fpsimd_flush_thread(void) */ void fpsimd_preserve_current_state(void) { + if (!system_supports_fpsimd()) + return; preempt_disable(); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); @@ -181,6 +187,8 @@ void fpsimd_preserve_current_state(void) */ void fpsimd_restore_current_state(void) { + if (!system_supports_fpsimd()) + return; preempt_disable(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; @@ -199,6 +207,8 @@ void fpsimd_restore_current_state(void) */ void fpsimd_update_current_state(struct fpsimd_state *state) { + if (!system_supports_fpsimd()) + return; preempt_disable(); fpsimd_load_state(state); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { @@ -228,6 +238,8 @@ static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); */ void kernel_neon_begin_partial(u32 num_regs) { + if (WARN_ON(!system_supports_fpsimd())) + return; if (in_interrupt()) { struct fpsimd_partial_state *s = this_cpu_ptr( in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); @@ -252,6 +264,8 @@ EXPORT_SYMBOL(kernel_neon_begin_partial); void kernel_neon_end(void) { + if (!system_supports_fpsimd()) + return; if (in_interrupt()) { struct fpsimd_partial_state *s = this_cpu_ptr( in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 332e33193ccf..4b1abac3485a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -326,14 +326,14 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -412,7 +412,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range @@ -428,7 +428,8 @@ ENDPROC(__create_page_tables) __primary_switched: adrp x4, init_thread_union add sp, x4, #THREAD_SIZE - msr sp_el0, x4 // Save thread_info + adr_l x5, init_task + msr sp_el0, x5 // Save thread_info adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address @@ -524,10 +525,21 @@ set_hcr: msr hcr_el2, x0 isb - /* Generic timers. */ + /* + * Allow Non-secure EL1 and EL0 to access physical timer and counter. + * This is not necessary for VHE, since the host kernel runs in EL2, + * and EL0 accesses are configured in the later stage of boot process. + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined + * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 + * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in + * EL2. + */ + cbnz x2, 1f mrs x0, cnthctl_el2 orr x0, x0, #3 // Enable EL1 physical timers msr cnthctl_el2, x0 +1: msr cntvoff_el2, xzr // Clear virtual offset #ifdef CONFIG_ARM_GIC_V3 @@ -699,10 +711,10 @@ __secondary_switched: isb adr_l x0, secondary_data - ldr x0, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - mov sp, x0 - and x0, x0, #~(THREAD_SIZE - 1) - msr sp_el0, x0 // save thread_info + ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack + mov sp, x1 + ldr x2, [x0, #CPU_BOOT_TASK] + msr sp_el0, x2 mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index d55a7b09959b..fe301cbcb442 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -136,7 +136,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) /* Save the mpidr of the cpu we called cpu_suspend() on... */ if (sleep_cpu < 0) { - pr_err("Failing to hibernate on an unkown CPU.\n"); + pr_err("Failing to hibernate on an unknown CPU.\n"); return -ENODEV; } hdr->sleep_cpu_mpidr = cpu_logical_map(sleep_cpu); @@ -547,7 +547,7 @@ out: int hibernate_resume_nonboot_cpu_disable(void) { if (sleep_cpu < 0) { - pr_err("Failing to resume from hibernate on an unkown CPU.\n"); + pr_err("Failing to resume from hibernate on an unknown CPU.\n"); return -ENODEV; } diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 948b73148d56..1b3c747fedda 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -317,9 +317,21 @@ static int get_hbp_len(u8 hbp_len) case ARM_BREAKPOINT_LEN_2: len_in_bytes = 2; break; + case ARM_BREAKPOINT_LEN_3: + len_in_bytes = 3; + break; case ARM_BREAKPOINT_LEN_4: len_in_bytes = 4; break; + case ARM_BREAKPOINT_LEN_5: + len_in_bytes = 5; + break; + case ARM_BREAKPOINT_LEN_6: + len_in_bytes = 6; + break; + case ARM_BREAKPOINT_LEN_7: + len_in_bytes = 7; + break; case ARM_BREAKPOINT_LEN_8: len_in_bytes = 8; break; @@ -349,7 +361,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) * to generic breakpoint descriptions. */ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type) + int *gen_len, int *gen_type, int *offset) { /* Type */ switch (ctrl.type) { @@ -369,17 +381,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } + if (!ctrl.len) + return -EINVAL; + *offset = __ffs(ctrl.len); + /* Len */ - switch (ctrl.len) { + switch (ctrl.len >> *offset) { case ARM_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; case ARM_BREAKPOINT_LEN_2: *gen_len = HW_BREAKPOINT_LEN_2; break; + case ARM_BREAKPOINT_LEN_3: + *gen_len = HW_BREAKPOINT_LEN_3; + break; case ARM_BREAKPOINT_LEN_4: *gen_len = HW_BREAKPOINT_LEN_4; break; + case ARM_BREAKPOINT_LEN_5: + *gen_len = HW_BREAKPOINT_LEN_5; + break; + case ARM_BREAKPOINT_LEN_6: + *gen_len = HW_BREAKPOINT_LEN_6; + break; + case ARM_BREAKPOINT_LEN_7: + *gen_len = HW_BREAKPOINT_LEN_7; + break; case ARM_BREAKPOINT_LEN_8: *gen_len = HW_BREAKPOINT_LEN_8; break; @@ -423,9 +451,21 @@ static int arch_build_bp_info(struct perf_event *bp) case HW_BREAKPOINT_LEN_2: info->ctrl.len = ARM_BREAKPOINT_LEN_2; break; + case HW_BREAKPOINT_LEN_3: + info->ctrl.len = ARM_BREAKPOINT_LEN_3; + break; case HW_BREAKPOINT_LEN_4: info->ctrl.len = ARM_BREAKPOINT_LEN_4; break; + case HW_BREAKPOINT_LEN_5: + info->ctrl.len = ARM_BREAKPOINT_LEN_5; + break; + case HW_BREAKPOINT_LEN_6: + info->ctrl.len = ARM_BREAKPOINT_LEN_6; + break; + case HW_BREAKPOINT_LEN_7: + info->ctrl.len = ARM_BREAKPOINT_LEN_7; + break; case HW_BREAKPOINT_LEN_8: info->ctrl.len = ARM_BREAKPOINT_LEN_8; break; @@ -517,18 +557,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) default: return -EINVAL; } - - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; } else { if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - if (info->address & alignment_mask) - return -EINVAL; + offset = info->address & alignment_mask; } + info->address &= ~alignment_mask; + info->ctrl.len <<= offset; + /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. @@ -661,12 +700,47 @@ unlock: } NOKPROBE_SYMBOL(breakpoint_handler); +/* + * Arm64 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction access both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelyhood that a given access triggered the watchpoint. + * + * See Section D2.10.5 "Determining the memory location that caused a Watchpoint + * exception" of ARMv8 Architecture Reference Manual for details. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0. + */ +static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u64 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - int i, step = 0, *kernel_step, access; + int i, step = 0, *kernel_step, access, closest_match = 0; + u64 min_dist = -1, dist; u32 ctrl_reg; - u64 val, alignment_mask; + u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; @@ -675,35 +749,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, slots = this_cpu_ptr(wp_on_reg); debug_info = ¤t->thread.debug; + /* + * Find all watchpoints that match the reported address. If no exact + * match is found. Attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; - - info = counter_arch_bp(wp); - /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ - if (is_compat_task()) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - } else { - alignment_mask = 0x7; - } - - /* Check if the watchpoint value matches. */ - val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (addr & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select to confirm. */ - ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); - decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) - goto unlock; + continue; /* * Check that the access type matches. @@ -712,18 +766,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; + /* Check if the watchpoint value and byte select match. */ + val = read_wb_reg(AARCH64_DBG_REG_WVR, i); + ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + + info = counter_arch_bp(wp); info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (is_default_overflow_handler(wp)) step = 1; + } + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + perf_bp_event(wp, regs); -unlock: - rcu_read_unlock(); + /* Do we need to handle the stepping? */ + if (is_default_overflow_handler(wp)) + step = 1; } + rcu_read_unlock(); if (!step) return 0; diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 6f2ac4fc66ca..94b62c1fa4df 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -30,7 +30,6 @@ #include <asm/cacheflush.h> #include <asm/debug-monitors.h> #include <asm/fixmap.h> -#include <asm/opcodes.h> #include <asm/insn.h> #define AARCH64_INSN_SF_BIT BIT(31) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index e017a9493b92..d217c9e95b06 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -247,6 +247,9 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) { + if (!kgdb_single_step) + return DBG_HOOK_ERROR; + kgdb_handle_exception(1, SIGTRAP, 0, regs); return 0; } diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile index ce06312e3d34..89b6df613dde 100644 --- a/arch/arm64/kernel/probes/Makefile +++ b/arch/arm64/kernel/probes/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o \ kprobes_trampoline.o \ simulate-insn.o +obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o \ + simulate-insn.o diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index d1731bf977ef..6bf6657a5a52 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -17,7 +17,6 @@ #include <linux/kprobes.h> #include <linux/module.h> #include <linux/kallsyms.h> -#include <asm/kprobes.h> #include <asm/insn.h> #include <asm/sections.h> @@ -78,8 +77,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD If instruction is supported and uses instruction slot, * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. */ -static enum kprobe_insn __kprobes -arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -89,26 +88,26 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD; if (aarch64_insn_is_bcond(insn)) { - asi->handler = simulate_b_cond; + api->handler = simulate_b_cond; } else if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn)) { - asi->handler = simulate_cbz_cbnz; + api->handler = simulate_cbz_cbnz; } else if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - asi->handler = simulate_tbz_tbnz; + api->handler = simulate_tbz_tbnz; } else if (aarch64_insn_is_adr_adrp(insn)) { - asi->handler = simulate_adr_adrp; + api->handler = simulate_adr_adrp; } else if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - asi->handler = simulate_b_bl; + api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { - asi->handler = simulate_br_blr_ret; + api->handler = simulate_br_blr_ret; } else if (aarch64_insn_is_ldr_lit(insn)) { - asi->handler = simulate_ldr_literal; + api->handler = simulate_ldr_literal; } else if (aarch64_insn_is_ldrsw_lit(insn)) { - asi->handler = simulate_ldrsw_literal; + api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -120,6 +119,7 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD_NO_SLOT; } +#ifdef CONFIG_KPROBES static bool __kprobes is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) { @@ -138,12 +138,12 @@ is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) return false; } -enum kprobe_insn __kprobes +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { - enum kprobe_insn decoded; - kprobe_opcode_t insn = le32_to_cpu(*addr); - kprobe_opcode_t *scan_end = NULL; + enum probe_insn decoded; + probe_opcode_t insn = le32_to_cpu(*addr); + probe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; /* @@ -162,7 +162,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, asi); + decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) @@ -170,3 +170,4 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) return decoded; } +#endif diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index d438289646a6..76d3f315407f 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -23,13 +23,17 @@ */ #define MAX_ATOMIC_CONTEXT_SIZE (128 / sizeof(kprobe_opcode_t)) -enum kprobe_insn { +enum probe_insn { INSN_REJECTED, INSN_GOOD_NO_SLOT, INSN_GOOD, }; -enum kprobe_insn __kprobes +#ifdef CONFIG_KPROBES +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); +#endif +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f5077ea7af6d..1decd2b2c730 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -44,31 +44,31 @@ post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { /* prepare insn slot */ - p->ainsn.insn[0] = cpu_to_le32(p->opcode); + p->ainsn.api.insn[0] = cpu_to_le32(p->opcode); - flush_icache_range((uintptr_t) (p->ainsn.insn), - (uintptr_t) (p->ainsn.insn) + + flush_icache_range((uintptr_t) (p->ainsn.api.insn), + (uintptr_t) (p->ainsn.api.insn) + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.restore = (unsigned long) p->addr + + p->ainsn.api.restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. No need to adjust the PC */ - p->ainsn.restore = 0; + p->ainsn.api.restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - if (p->ainsn.handler) - p->ainsn.handler((u32)p->opcode, (long)p->addr, regs); + if (p->ainsn.api.handler) + p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(kcb, regs); @@ -98,18 +98,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.insn = NULL; + p->ainsn.api.insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) + p->ainsn.api.insn = get_insn_slot(); + if (!p->ainsn.api.insn) return -ENOMEM; break; }; /* prepare the instruction */ - if (p->ainsn.insn) + if (p->ainsn.api.insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -142,9 +142,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; + if (p->ainsn.api.insn) { + free_insn_slot(p->ainsn.api.insn, 0); + p->ainsn.api.insn = NULL; } } @@ -244,9 +244,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.insn) { + if (p->ainsn.api.insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.insn; + slot = (unsigned long)p->ainsn.api.insn; set_ss_context(kcb, slot); /* mark pending ss */ @@ -295,8 +295,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) return; /* return addr restore if non-branching insn */ - if (cur->ainsn.restore != 0) - instruction_pointer_set(regs, cur->ainsn.restore); + if (cur->ainsn.api.restore != 0) + instruction_pointer_set(regs, cur->ainsn.api.restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 8977ce9d009d..357d3efe1366 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,28 +13,26 @@ * General Public License for more details. */ +#include <linux/bitops.h> #include <linux/kernel.h> #include <linux/kprobes.h> #include "simulate-insn.h" -#define sign_extend(x, signbit) \ - ((x) | (0 - ((x) & (1 << (signbit))))) - #define bbl_displacement(insn) \ - sign_extend(((insn) & 0x3ffffff) << 2, 27) + sign_extend32(((insn) & 0x3ffffff) << 2, 27) #define bcond_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) #define cbz_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) #define tbz_displacement(insn) \ - sign_extend(((insn >> 5) & 0x3fff) << 2, 15) + sign_extend32(((insn >> 5) & 0x3fff) << 2, 15) #define ldr_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val) { @@ -106,7 +104,7 @@ simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs) xn = opcode & 0x1f; imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3); - imm = sign_extend(imm, 20); + imm = sign_extend64(imm, 20); if (opcode & 0x80000000) val = (imm<<12) + (addr & 0xfffffffffffff000); else diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c new file mode 100644 index 000000000000..26c998534dca --- /dev/null +++ b/arch/arm64/kernel/probes/uprobes.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <asm/cacheflush.h> + +#include "decode-insn.h" + +#define UPROBE_INV_FAULT_CODE UINT_MAX + +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + void *xol_page_kaddr = kmap_atomic(page); + void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + + /* Initialize the slot */ + memcpy(dst, src, len); + + /* flush caches (dcache/icache) */ + sync_icache_aliases(dst, len); + + kunmap_atomic(xol_page_kaddr); +} + +unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long addr) +{ + probe_opcode_t insn; + + /* TODO: Currently we do not support AARCH32 instruction probing */ + if (test_bit(TIF_32BIT, &mm->context.flags)) + return -ENOTSUPP; + else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) + return -EINVAL; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + + switch (arm_probe_decode_insn(insn, &auprobe->api)) { + case INSN_REJECTED: + return -EINVAL; + + case INSN_GOOD_NO_SLOT: + auprobe->simulate = true; + break; + + default: + break; + } + + return 0; +} + +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* Initialize with an invalid fault code to detect if ol insn trapped */ + current->thread.fault_code = UPROBE_INV_FAULT_CODE; + + /* Instruction points to execute ol */ + instruction_pointer_set(regs, utask->xol_vaddr); + + user_enable_single_step(current); + + return 0; +} + +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + WARN_ON_ONCE(current->thread.fault_code != UPROBE_INV_FAULT_CODE); + + /* Instruction points to execute next to breakpoint address */ + instruction_pointer_set(regs, utask->vaddr + 4); + + user_disable_single_step(current); + + return 0; +} +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + /* + * Between arch_uprobe_pre_xol and arch_uprobe_post_xol, if an xol + * insn itself is trapped, then detect the case with the help of + * invalid fault code which is being set in arch_uprobe_pre_xol + */ + if (t->thread.fault_code != UPROBE_INV_FAULT_CODE) + return true; + + return false; +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + probe_opcode_t insn; + unsigned long addr; + + if (!auprobe->simulate) + return false; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + addr = instruction_pointer(regs); + + if (auprobe->api.handler) + auprobe->api.handler(insn, addr, regs); + + return true; +} + +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* + * Task has received a fatal signal, so reset back to probbed + * address. + */ + instruction_pointer_set(regs, utask->vaddr); + + user_disable_single_step(current); +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + /* + * If a simple branch instruction (B) was called for retprobed + * assembly label then return true even when regs->sp and ret->stack + * are same. It will ensure that cleanup and reporting of return + * instances corresponding to callee label is done when + * handle_trampoline for called function is executed. + */ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->sp <= ret->stack; + else + return regs->sp < ret->stack; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, + struct pt_regs *regs) +{ + unsigned long orig_ret_vaddr; + + orig_ret_vaddr = procedure_link_pointer(regs); + /* Replace the return addr with trampoline addr */ + procedure_link_pointer_set(regs, trampoline_vaddr); + + return orig_ret_vaddr; +} + +int arch_uprobe_exception_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + +static int uprobe_breakpoint_handler(struct pt_regs *regs, + unsigned int esr) +{ + if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; + + return DBG_HOOK_ERROR; +} + +static int uprobe_single_step_handler(struct pt_regs *regs, + unsigned int esr) +{ + struct uprobe_task *utask = current->utask; + + if (user_mode(regs)) { + WARN_ON(utask && + (instruction_pointer(regs) != utask->xol_vaddr + 4)); + + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; + } + + return DBG_HOOK_ERROR; +} + +/* uprobe breakpoint handler hook */ +static struct break_hook uprobes_break_hook = { + .esr_mask = BRK64_ESR_MASK, + .esr_val = BRK64_ESR_UPROBES, + .fn = uprobe_breakpoint_handler, +}; + +/* uprobe single step handler hook */ +static struct step_hook uprobes_step_hook = { + .fn = uprobe_single_step_handler, +}; + +static int __init arch_init_uprobes(void) +{ + register_break_hook(&uprobes_break_hook); + register_step_hook(&uprobes_step_hook); + + return 0; +} + +device_initcall(arch_init_uprobes); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 01753cd7d3f0..a3a2816ba73a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -45,6 +45,7 @@ #include <linux/personality.h> #include <linux/notifier.h> #include <trace/events/power.h> +#include <linux/percpu.h> #include <asm/alternative.h> #include <asm/compat.h> @@ -282,7 +283,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_cap(ARM64_HAS_UAO)) + cpus_have_const_cap(ARM64_HAS_UAO)) childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; @@ -322,6 +323,20 @@ void uao_thread_switch(struct task_struct *next) } /* + * We store our current task in sp_el0, which is clobbered by userspace. Keep a + * shadow copy so that we can restore this upon entry from userspace. + * + * This is *only* for exception entry from EL0, and is not valid until we + * __switch_to() a user task. + */ +DEFINE_PER_CPU(struct task_struct *, __entry_task); + +static void entry_task_switch(struct task_struct *next) +{ + __this_cpu_write(__entry_task, next); +} + +/* * Thread switching. */ struct task_struct *__switch_to(struct task_struct *prev, @@ -333,6 +348,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + entry_task_switch(next); uao_thread_switch(next); /* @@ -350,27 +366,35 @@ struct task_struct *__switch_to(struct task_struct *prev, unsigned long get_wchan(struct task_struct *p) { struct stackframe frame; - unsigned long stack_page; + unsigned long stack_page, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + stack_page = (unsigned long)try_get_task_stack(p); + if (!stack_page) + return 0; + frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif - stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || unwind_frame(p, &frame)) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; + goto out; + if (!in_sched_functions(frame.pc)) { + ret = frame.pc; + goto out; + } } while (count ++ < 16); - return 0; + +out: + put_task_stack(p); + return ret; } unsigned long arch_align_stack(unsigned long sp) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index e0c81da60f76..fc35e06ccaac 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -327,13 +327,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, disabled = !ctrl.enabled; + int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; - err = arch_bp_generic_fields(ctrl, &len, &type); + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; @@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, attr->bp_len = len; attr->bp_type = type; + attr->bp_addr += offset; return 0; } @@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type, if (IS_ERR(bp)) return PTR_ERR(bp); - *addr = bp ? bp->attr.bp_addr : 0; + *addr = bp ? counter_arch_bp(bp)->address : 0; return 0; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 1718706fde83..12a87f2600f2 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/ftrace.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> struct return_address_data { diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f534f492a268..a53f52ac81c6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -291,6 +291,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 404dd67080b9..c7b6de62f9d3 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -414,6 +414,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, } else { local_irq_enable(); + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + if (thread_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 1bec41b5fda3..df67652e46f0 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -125,9 +125,6 @@ ENTRY(_cpu_resume) /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] mov sp, x2 - /* save thread_info */ - and x2, x2, #~(THREAD_SIZE - 1) - msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context address pointer */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 8507703dabe4..cb87234cfcf2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -58,6 +58,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/ipi.h> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -146,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * We need to tell the secondary core where to find its stack and the * page tables. */ + secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -170,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_err("CPU%u: failed to boot: %d\n", cpu, ret); } + secondary_data.task = NULL; secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); if (ret && status) { @@ -208,7 +213,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) asmlinkage void secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -217,8 +225,6 @@ asmlinkage void secondary_start_kernel(void) atomic_inc(&mm->mm_count); current->active_mm = mm; - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. @@ -718,6 +724,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ for_each_possible_cpu(cpu) { + per_cpu(cpu_number, cpu) = cpu; + if (cpu == smp_processor_id()) continue; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index c2efddfca18c..8a552a33c6ef 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <asm/irq.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* @@ -128,7 +129,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, break; } } -EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { @@ -181,6 +181,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) struct stack_trace_data data; struct stackframe frame; + if (!try_get_task_stack(tsk)) + return; + data.trace = trace; data.skip = trace->skip; @@ -202,6 +205,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index bb0cd787a9d3..1e3be9064cfa 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -47,12 +47,6 @@ void notrace __cpu_suspend_exit(void) cpu_uninstall_idmap(); /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. - */ - set_my_cpu_offset(per_cpu_offset(cpu)); - - /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..23e9e13bd2aa 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,10 +19,226 @@ #include <linux/nodemask.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/cpufreq.h> +#include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_MUTEX(cpu_scale_mutex); + +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + +#ifdef CONFIG_PROC_SYSCTL +static ssize_t cpu_capacity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + + return sprintf(buf, "%lu\n", + arch_scale_cpu_capacity(NULL, cpu->dev.id)); +} + +static ssize_t cpu_capacity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + int this_cpu = cpu->dev.id, i; + unsigned long new_capacity; + ssize_t ret; + + if (count) { + ret = kstrtoul(buf, 0, &new_capacity); + if (ret) + return ret; + if (new_capacity > SCHED_CAPACITY_SCALE) + return -EINVAL; + + mutex_lock(&cpu_scale_mutex); + for_each_cpu(i, &cpu_topology[this_cpu].core_sibling) + set_capacity_scale(i, new_capacity); + mutex_unlock(&cpu_scale_mutex); + } + + return count; +} + +static DEVICE_ATTR_RW(cpu_capacity); + +static int register_cpu_capacity_sysctl(void) +{ + int i; + struct device *cpu; + + for_each_possible_cpu(i) { + cpu = get_cpu_device(i); + if (!cpu) { + pr_err("%s: too early to get CPU%d device!\n", + __func__, i); + continue; + } + device_create_file(cpu, &dev_attr_cpu_capacity); + } + + return 0; +} +subsys_initcall(register_cpu_capacity_sysctl); +#endif + +static u32 capacity_scale; +static u32 *raw_capacity; +static bool cap_parsing_failed; + +static void __init parse_cpu_capacity(struct device_node *cpu_node, int cpu) +{ + int ret; + u32 cpu_capacity; + + if (cap_parsing_failed) + return; + + ret = of_property_read_u32(cpu_node, + "capacity-dmips-mhz", + &cpu_capacity); + if (!ret) { + if (!raw_capacity) { + raw_capacity = kcalloc(num_possible_cpus(), + sizeof(*raw_capacity), + GFP_KERNEL); + if (!raw_capacity) { + pr_err("cpu_capacity: failed to allocate memory for raw capacities\n"); + cap_parsing_failed = true; + return; + } + } + capacity_scale = max(cpu_capacity, capacity_scale); + raw_capacity[cpu] = cpu_capacity; + pr_debug("cpu_capacity: %s cpu_capacity=%u (raw)\n", + cpu_node->full_name, raw_capacity[cpu]); + } else { + if (raw_capacity) { + pr_err("cpu_capacity: missing %s raw capacity\n", + cpu_node->full_name); + pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n"); + } + cap_parsing_failed = true; + kfree(raw_capacity); + } +} + +static void normalize_cpu_capacity(void) +{ + u64 capacity; + int cpu; + + if (!raw_capacity || cap_parsing_failed) + return; + + pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale); + mutex_lock(&cpu_scale_mutex); + for_each_possible_cpu(cpu) { + pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n", + cpu, raw_capacity[cpu]); + capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT) + / capacity_scale; + set_capacity_scale(cpu, capacity); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); + } + mutex_unlock(&cpu_scale_mutex); +} + +#ifdef CONFIG_CPU_FREQ +static cpumask_var_t cpus_to_visit; +static bool cap_parsing_done; +static void parsing_done_workfn(struct work_struct *work); +static DECLARE_WORK(parsing_done_work, parsing_done_workfn); + +static int +init_cpu_capacity_callback(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + int cpu; + + if (cap_parsing_failed || cap_parsing_done) + return 0; + + switch (val) { + case CPUFREQ_NOTIFY: + pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n", + cpumask_pr_args(policy->related_cpus), + cpumask_pr_args(cpus_to_visit)); + cpumask_andnot(cpus_to_visit, + cpus_to_visit, + policy->related_cpus); + for_each_cpu(cpu, policy->related_cpus) { + raw_capacity[cpu] = arch_scale_cpu_capacity(NULL, cpu) * + policy->cpuinfo.max_freq / 1000UL; + capacity_scale = max(raw_capacity[cpu], capacity_scale); + } + if (cpumask_empty(cpus_to_visit)) { + normalize_cpu_capacity(); + kfree(raw_capacity); + pr_debug("cpu_capacity: parsing done\n"); + cap_parsing_done = true; + schedule_work(&parsing_done_work); + } + } + return 0; +} + +static struct notifier_block init_cpu_capacity_notifier = { + .notifier_call = init_cpu_capacity_callback, +}; + +static int __init register_cpufreq_notifier(void) +{ + if (cap_parsing_failed) + return -EINVAL; + + if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) { + pr_err("cpu_capacity: failed to allocate memory for cpus_to_visit\n"); + return -ENOMEM; + } + cpumask_copy(cpus_to_visit, cpu_possible_mask); + + return cpufreq_register_notifier(&init_cpu_capacity_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(register_cpufreq_notifier); + +static void parsing_done_workfn(struct work_struct *work) +{ + cpufreq_unregister_notifier(&init_cpu_capacity_notifier, + CPUFREQ_POLICY_NOTIFIER); +} + +#else +static int __init free_raw_capacity(void) +{ + kfree(raw_capacity); + + return 0; +} +core_initcall(free_raw_capacity); +#endif + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -34,6 +250,7 @@ static int __init get_cpu_for_node(struct device_node *node) for_each_possible_cpu(cpu) { if (of_get_cpu_node(cpu, NULL) == cpu_node) { + parse_cpu_capacity(cpu_node, cpu); of_node_put(cpu_node); return cpu; } @@ -178,13 +395,17 @@ static int __init parse_dt_topology(void) * cluster with restricted subnodes. */ map = of_get_child_by_name(cn, "cpu-map"); - if (!map) + if (!map) { + cap_parsing_failed = true; goto out; + } ret = parse_cluster(map, 0); if (ret != 0) goto out_map; + normalize_cpu_capacity(); + /* * Check that all cores are in the topology; the SMP code will * only mark cores described in the DT as possible. diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index c9986b3e0a96..5b830be79c01 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> #include <asm/system_misc.h> @@ -147,6 +148,9 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!tsk) tsk = current; + if (!try_get_task_stack(tsk)) + return; + /* * Switching between stacks is valid when tracing current and in * non-preemptible context. @@ -212,6 +216,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) stack + sizeof(struct pt_regs)); } } + + put_task_stack(tsk); } void show_stack(struct task_struct *tsk, unsigned long *sp) @@ -227,10 +233,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) +static int __die(const char *str, int err, struct pt_regs *regs) { - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; static int die_counter; int ret; @@ -245,7 +250,8 @@ static int __die(const char *str, int err, struct thread_info *thread, print_modules(); __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); + TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), + end_of_stack(tsk)); if (!user_mode(regs)) { dump_mem(KERN_EMERG, "Stack: ", regs->sp, @@ -264,7 +270,6 @@ static DEFINE_RAW_SPINLOCK(die_lock); */ void die(const char *str, struct pt_regs *regs, int err) { - struct thread_info *thread = current_thread_info(); int ret; oops_enter(); @@ -272,9 +277,9 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); - ret = __die(str, err, thread, regs); + ret = __die(str, err, regs); - if (regs && kexec_should_crash(thread->task)) + if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); @@ -435,9 +440,10 @@ int cpu_enable_cache_maint_trap(void *__unused) } #define __user_cache_maint(insn, address, res) \ - if (untagged_addr(address) >= user_addr_max()) \ + if (untagged_addr(address) >= user_addr_max()) { \ res = -EFAULT; \ - else \ + } else { \ + uaccess_ttbr0_enable(); \ asm volatile ( \ "1: " insn ", %1\n" \ " mov %w0, #0\n" \ @@ -449,7 +455,9 @@ int cpu_enable_cache_maint_trap(void *__unused) " .popsection\n" \ _ASM_EXTABLE(1b, 3b) \ : "=r" (res) \ - : "r" (address), "i" (-EFAULT) ) + : "r" (address), "i" (-EFAULT)); \ + uaccess_ttbr0_disable(); \ + } static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) { diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1105aab1e6d6..b8deffa9e1bf 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -216,6 +216,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . += RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 6eaf12c1d627..52cb7ad9b2fd 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -16,9 +16,6 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM_ARM_VGIC_V3_ITS - bool - config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF @@ -34,7 +31,6 @@ config KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_VGIC_V3_ITS select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a204adf29f0a..1bfe30dfbfe7 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -57,6 +57,16 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +/* + * Guest access to FP/ASIMD registers are routed to this handler only + * when the system doesn't support FP/ASIMD. + */ +static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 1; +} + /** * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event * instruction executed by a guest @@ -144,6 +154,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, + [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 4e92399f7105..5e9052f087f2 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -106,9 +106,16 @@ el1_trap: * x0: ESR_EC */ - /* Guest accessed VFP/SIMD registers, save host, restore Guest */ + /* + * We trap the first access to the FP/SIMD to save the host context + * and restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an + * undefined instruction exception to the guest. + */ +alternative_if_not ARM64_HAS_NO_FPSIMD cmp x0, #ESR_ELx_EC_FP_ASIMD b.eq __fpsimd_guest_restore +alternative_else_nop_endif mrs x1, tpidr_el2 mov x0, #ARM_EXCEPTION_TRAP diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 83037cd62d01..75e83dd40d43 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -21,6 +21,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/fpsimd.h> static bool __hyp_text __fpsimd_enabled_nvhe(void) { @@ -76,16 +77,24 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) * traps are only taken to EL2 if the operation would not otherwise * trap to EL1. Therefore, always make sure that for 32-bit guests, * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit. + * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to + * it will cause an exception. */ val = vcpu->arch.hcr_el2; - if (!(val & HCR_RW)) { + if (!(val & HCR_RW) && system_supports_fpsimd()) { write_sysreg(1 << 30, fpexc32_el2); isb(); } write_sysreg(val, hcr_el2); /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); - /* Make sure we trap PMU access from EL0 to EL2 */ + /* + * Make sure we trap PMU access from EL0 to EL2. Also sanitize + * PMSELR_EL0 to make sure it never contains the cycle + * counter, which could make a PMXEVCNTR_EL0 access UNDEF at + * EL1 instead of being trapped to EL2. + */ + write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); __activate_traps_arch()(); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5bc460884639..e95d4f68bf54 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -86,12 +86,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; - case KVM_CAP_MSI_DEVID: - if (!kvm) - r = -EINVAL; - else - r = kvm->arch.vgic.msis_require_devid; - break; default: r = 0; } diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..d7150e30438a 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> .text @@ -33,8 +30,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4fd67ea03bb0..cfe13396085b 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +64,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..718b1c4e2f85 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +65,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..e99e31c9acac 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +63,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 54bb209cae8e..e703fb9defad 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,7 +3,8 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_ARM64_PTDUMP) += dump.o +obj-$(CONFIG_ARM64_PTDUMP_CORE) += dump.o +obj-$(CONFIG_ARM64_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 58b5a906ff78..da9576932322 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,6 +23,7 @@ #include <asm/assembler.h> #include <asm/cpufeature.h> #include <asm/alternative.h> +#include <asm/uaccess.h> /* * flush_icache_range(start,end) @@ -48,6 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) + uaccess_ttbr0_enable x2, x3 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU dsb ish isb mov x0, #0 +1: + uaccess_ttbr0_disable x1 ret 9: mov x0, #-EFAULT - ret + b 1b ENDPROC(flush_icache_range) ENDPROC(__flush_cache_user_range) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index efcf1f7ef1e4..4c63cb154859 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -221,7 +221,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3f74d0d98de6..aa6c8f834d9e 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -938,11 +938,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, void arch_teardown_dma_ops(struct device *dev) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (WARN_ON(domain)) - iommu_detach_device(domain, dev); - dev->archdata.dma_ops = NULL; } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 9c3e75df2180..ca74a2aace42 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -50,6 +50,18 @@ static const struct addr_marker address_markers[] = { { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + if (m) \ + seq_printf(m, fmt); \ +}) + /* * The page dumper groups page table entries of the same type into a single * description. It uses pg_state to track the range information while @@ -62,6 +74,9 @@ struct pg_state { unsigned long start_address; unsigned level; u64 current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long uxn_pages; }; struct prot_bits { @@ -186,10 +201,39 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, s = bits->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } } +static void note_prot_uxn(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if ((st->current_prot & PTE_UXN) == PTE_UXN) + return; + + WARN_ONCE(1, "arm64/mm: Found non-UXN mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->uxn_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + if ((st->current_prot & PTE_RDONLY) == PTE_RDONLY) + return; + if ((st->current_prot & PTE_PXN) == PTE_PXN) + return; + + WARN_ONCE(1, "arm64/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u64 val) { @@ -200,14 +244,16 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, st->level = level; st->current_prot = prot; st->start_address = addr; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; if (st->current_prot) { - seq_printf(st->seq, "0x%016lx-0x%016lx ", + note_prot_uxn(st, addr); + note_prot_wx(st, addr); + pt_dump_seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr); delta = (addr - st->start_address) >> 10; @@ -215,17 +261,17 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c %s", delta, *unit, + pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit, pg_level[st->level].name); if (pg_level[st->level].bits) dump_prot(st, pg_level[st->level].bits, pg_level[st->level].num); - seq_puts(st->seq, "\n"); + pt_dump_seq_puts(st->seq, "\n"); } if (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; @@ -235,7 +281,7 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, if (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } } @@ -304,9 +350,8 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, } } -static int ptdump_show(struct seq_file *m, void *v) +void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info) { - struct ptdump_info *info = m->private; struct pg_state st = { .seq = m, .marker = info->markers, @@ -315,33 +360,16 @@ static int ptdump_show(struct seq_file *m, void *v) walk_pgd(&st, info->mm, info->base_addr); note_page(&st, 0, 0, 0); - return 0; } -static int ptdump_open(struct inode *inode, struct file *file) +static void ptdump_initialize(void) { - return single_open(file, ptdump_show, inode->i_private); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -int ptdump_register(struct ptdump_info *info, const char *name) -{ - struct dentry *pe; unsigned i, j; for (i = 0; i < ARRAY_SIZE(pg_level); i++) if (pg_level[i].bits) for (j = 0; j < pg_level[i].num; j++) pg_level[i].mask |= pg_level[i].bits[j].mask; - - pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); - return pe ? 0 : -ENOMEM; } static struct ptdump_info kernel_ptdump_info = { @@ -350,8 +378,30 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = VA_START, }; +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, + .check_wx = true, + }; + + walk_pgd(&st, &init_mm, 0); + note_page(&st, 0, 0, 0); + if (st.wx_pages || st.uxn_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", + st.wx_pages, st.uxn_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} + static int ptdump_init(void) { - return ptdump_register(&kernel_ptdump_info, "kernel_page_tables"); + ptdump_initialize(); + return ptdump_debugfs_register(&kernel_ptdump_info, + "kernel_page_tables"); } device_initcall(ptdump_init); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f8788374815..a78a5c401806 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -269,13 +269,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -315,7 +321,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -507,10 +513,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 8377329d8c97..554a2558c12e 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -25,14 +25,7 @@ #include <asm/cachetype.h> #include <asm/tlbflush.h> -void flush_cache_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} - -static void sync_icache_aliases(void *kaddr, unsigned long len) +void sync_icache_aliases(void *kaddr, unsigned long len) { unsigned long addr = (unsigned long)kaddr; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2e49bd252fe7..964b7549af5c 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -51,20 +51,8 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, *pgsize = PAGE_SIZE; if (!pte_cont(pte)) return 1; - if (!pgd_present(*pgd)) { - VM_BUG_ON(!pgd_present(*pgd)); - return 1; - } pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - VM_BUG_ON(!pud_present(*pud)); - return 1; - } pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - VM_BUG_ON(!pmd_present(*pmd)); - return 1; - } if ((pte_t *)pmd == ptep) { *pgsize = PMD_SIZE; return CONT_PMDS; @@ -212,7 +200,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); - for (i = 1; i < ncontig; ++i) { + for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page @@ -250,7 +238,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pfn = pte_pfn(*cpte); ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) { + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { changed = ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, hugeprot), @@ -273,7 +261,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, cpte = huge_pte_offset(mm, addr); ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_set_wrprotect(mm, addr, cpte); } else { ptep_set_wrprotect(mm, addr, ptep); @@ -291,7 +279,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_clear_flush(vma, addr, cpte); } else { ptep_clear_flush(vma, addr, ptep); @@ -323,7 +311,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int add_default_hugepagesz(void) { if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); + hugetlb_add_hstate(CONT_PTE_SHIFT); return 0; } arch_initcall(add_default_hugepagesz); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 05615a3fdc6f..17243e43184e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -28,8 +28,6 @@ #include <linux/memblock.h> #include <linux/fs.h> #include <linux/io.h> -#include <linux/slab.h> -#include <linux/stop_machine.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -42,6 +40,7 @@ #include <asm/tlb.h> #include <asm/memblock.h> #include <asm/mmu_context.h> +#include <asm/ptdump.h> u64 idmap_t0sz = TCR_T0SZ(VA_BITS); @@ -95,11 +94,24 @@ static phys_addr_t __init early_pgtable_alloc(void) return phys; } +static bool pgattr_change_is_safe(u64 old, u64 new) +{ + /* + * The following mapping attributes may be updated in live + * kernel mappings without the need for break-before-make. + */ + static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; + + return old == 0 || new == 0 || ((old ^ new) & ~mask) == 0; +} + static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void), + bool page_mappings_only) { + pgprot_t __prot = prot; pte_t *pte; BUG_ON(pmd_sect(*pmd)); @@ -115,8 +127,28 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, pte = pte_set_fixmap_offset(pmd, addr); do { - set_pte(pte, pfn_pte(pfn, prot)); + pte_t old_pte = *pte; + + /* + * Set the contiguous bit for the subsequent group of PTEs if + * its size and alignment are appropriate. + */ + if (((addr | PFN_PHYS(pfn)) & ~CONT_PTE_MASK) == 0) { + if (end - addr >= CONT_PTE_SIZE && !page_mappings_only) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + else + __prot = prot; + } + + set_pte(pte, pfn_pte(pfn, __prot)); pfn++; + + /* + * After the PTE entry has been populated once, we + * only allow updates to the permission attributes. + */ + BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte))); + } while (pte++, addr += PAGE_SIZE, addr != end); pte_clear_fixmap(); @@ -125,8 +157,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { + pgprot_t __prot = prot; pmd_t *pmd; unsigned long next; @@ -146,27 +179,39 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_set_fixmap_offset(pud, addr); do { + pmd_t old_pmd = *pmd; + next = pmd_addr_end(addr, end); + /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && - allow_block_mappings) { - pmd_t old_pmd =*pmd; - pmd_set_huge(pmd, phys, prot); + !page_mappings_only) { /* - * Check for previous table entries created during - * boot (__create_page_tables) and flush them. + * Set the contiguous bit for the subsequent group of + * PMDs if its size and alignment are appropriate. */ - if (!pmd_none(old_pmd)) { - flush_tlb_all(); - if (pmd_table(old_pmd)) { - phys_addr_t table = pmd_page_paddr(old_pmd); - if (!WARN_ON_ONCE(slab_is_available())) - memblock_free(table, PAGE_SIZE); - } + if (((addr | phys) & ~CONT_PMD_MASK) == 0) { + if (end - addr >= CONT_PMD_SIZE) + __prot = __pgprot(pgprot_val(prot) | + PTE_CONT); + else + __prot = prot; } + pmd_set_huge(pmd, phys, __prot); + + /* + * After the PMD entry has been populated once, we + * only allow updates to the permission attributes. + */ + BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), + pmd_val(*pmd))); } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, pgtable_alloc); + prot, pgtable_alloc, + page_mappings_only); + + BUG_ON(pmd_val(old_pmd) != 0 && + pmd_val(old_pmd) != pmd_val(*pmd)); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -189,7 +234,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { pud_t *pud; unsigned long next; @@ -204,33 +249,28 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_set_fixmap_offset(pgd, addr); do { + pud_t old_pud = *pud; + next = pud_addr_end(addr, end); /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys) && allow_block_mappings) { - pud_t old_pud = *pud; + if (use_1G_block(addr, next, phys) && !page_mappings_only) { pud_set_huge(pud, phys, prot); /* - * If we have an old value for a pud, it will - * be pointing to a pmd table that we no longer - * need (from swapper_pg_dir). - * - * Look up the old pmd table and free it. + * After the PUD entry has been populated once, we + * only allow updates to the permission attributes. */ - if (!pud_none(old_pud)) { - flush_tlb_all(); - if (pud_table(old_pud)) { - phys_addr_t table = pud_page_paddr(old_pud); - if (!WARN_ON_ONCE(slab_is_available())) - memblock_free(table, PAGE_SIZE); - } - } + BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), + pud_val(*pud))); } else { alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc, allow_block_mappings); + pgtable_alloc, page_mappings_only); + + BUG_ON(pud_val(old_pud) != 0 && + pud_val(old_pud) != pud_val(*pud)); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -242,7 +282,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { unsigned long addr, length, end, next; pgd_t *pgd = pgd_offset_raw(pgdir, virt); @@ -262,7 +302,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, - allow_block_mappings); + page_mappings_only); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -291,17 +331,17 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, true); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot, bool allow_block_mappings) + pgprot_t prot, bool page_mappings_only) { BUG_ON(mm == &init_mm); __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, allow_block_mappings); + pgd_pgtable_alloc, page_mappings_only); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -314,7 +354,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - NULL, !debug_pagealloc_enabled()); + NULL, debug_pagealloc_enabled()); } static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) @@ -332,7 +372,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); return; } @@ -345,13 +385,13 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(start), kernel_start - start, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); if (kernel_end < end) __create_pgd_mapping(pgd, kernel_end, __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); /* * Map the linear alias of the [_text, __init_begin) interval as @@ -361,7 +401,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL_RO, - early_pgtable_alloc, !debug_pagealloc_enabled()); + early_pgtable_alloc, debug_pagealloc_enabled()); } static void __init map_mem(pgd_t *pgd) @@ -396,6 +436,11 @@ void mark_rodata_ro(void) section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + + /* flush the TLBs after updating live kernel mappings */ + flush_tlb_all(); + + debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, @@ -408,7 +453,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, !debug_pagealloc_enabled()); + early_pgtable_alloc, debug_pagealloc_enabled()); vma->addr = va_start; vma->phys_addr = pa_start; diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 352c73b6a59e..32682be978e0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -70,11 +70,14 @@ ENTRY(cpu_do_suspend) mrs x8, mdscr_el1 mrs x9, oslsr_el1 mrs x10, sctlr_el1 + mrs x11, tpidr_el1 + mrs x12, sp_el0 stp x2, x3, [x0] stp x4, xzr, [x0, #16] stp x5, x6, [x0, #32] stp x7, x8, [x0, #48] stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] ret ENDPROC(cpu_do_suspend) @@ -90,6 +93,7 @@ ENTRY(cpu_do_resume) ldp x6, x8, [x0, #32] ldp x9, x10, [x0, #48] ldp x11, x12, [x0, #64] + ldp x13, x14, [x0, #80] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -112,6 +116,8 @@ ENTRY(cpu_do_resume) msr mdscr_el1, x10 msr sctlr_el1, x12 + msr tpidr_el1, x13 + msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 */ @@ -136,11 +142,7 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if ARM64_WORKAROUND_CAVIUM_27456 - ic iallu - dsb nsh - isb -alternative_else_nop_endif + post_ttbr0_update_workaround ret ENDPROC(cpu_do_switch_mm) diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c new file mode 100644 index 000000000000..eee4d864350c --- /dev/null +++ b/arch/arm64/mm/ptdump_debugfs.c @@ -0,0 +1,31 @@ +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <asm/ptdump.h> + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct ptdump_info *info = m->private; + ptdump_walk_pgd(m, info); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *file) +{ + return single_open(file, ptdump_show, inode->i_private); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int ptdump_debugfs_register(struct ptdump_info *info, const char *name) +{ + struct dentry *pe; + pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); + return pe ? 0 : -ENOMEM; + +} diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 329c8027b0a9..b41aff25426d 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/uaccess.h> #include <xen/interface/xen.h> @@ -91,6 +92,20 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_ttbr0_enable x6, x7 hvc XEN_IMM + + /* + * Disable userspace access from kernel once the hyp call completed. + */ + uaccess_ttbr0_disable x6 ret ENDPROC(privcmd_call); diff --git a/arch/blackfin/mach-bf561/coreb.c b/arch/blackfin/mach-bf561/coreb.c index 8a2543c654b3..cf27554e76bf 100644 --- a/arch/blackfin/mach-bf561/coreb.c +++ b/arch/blackfin/mach-bf561/coreb.c @@ -1,5 +1,7 @@ /* Load firmware into Core B on a BF561 * + * Author: Bas Vermeulen <bvermeul@blackstar.xs4all.nl> + * * Copyright 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ @@ -14,9 +16,9 @@ #include <linux/device.h> #include <linux/fs.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/miscdevice.h> -#include <linux/module.h> #define CMD_COREB_START _IO('b', 0) #define CMD_COREB_STOP _IO('b', 1) @@ -59,8 +61,4 @@ static struct miscdevice coreb_dev = { .name = "coreb", .fops = &coreb_fops, }; -module_misc_device(coreb_dev); - -MODULE_AUTHOR("Bas Vermeulen <bvermeul@blackstar.xs4all.nl>"); -MODULE_DESCRIPTION("BF561 Core B Support"); -MODULE_LICENSE("GPL"); +builtin_misc_device(coreb_dev); diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index fae2f9447792..6080582a26d1 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -341,7 +341,7 @@ void output_pm_defines(void) void output_kvm_defines(void) { - COMMENT(" KVM/MIPS Specfic offsets. "); + COMMENT(" KVM/MIPS Specific offsets. "); OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]); OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]); diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index e407af2b7333..2e6a823fa502 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -70,7 +70,9 @@ #define HPTE_V_SSIZE_SHIFT 62 #define HPTE_V_AVPN_SHIFT 7 +#define HPTE_V_COMMON_BITS ASM_CONST(0x000fffffffffffff) #define HPTE_V_AVPN ASM_CONST(0x3fffffffffffff80) +#define HPTE_V_AVPN_3_0 ASM_CONST(0x000fffffffffff80) #define HPTE_V_AVPN_VAL(x) (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT) #define HPTE_V_COMPARE(x,y) (!(((x) ^ (y)) & 0xffffffffffffff80UL)) #define HPTE_V_BOLTED ASM_CONST(0x0000000000000010) @@ -80,14 +82,16 @@ #define HPTE_V_VALID ASM_CONST(0x0000000000000001) /* - * ISA 3.0 have a different HPTE format. + * ISA 3.0 has a different HPTE format. */ #define HPTE_R_3_0_SSIZE_SHIFT 58 +#define HPTE_R_3_0_SSIZE_MASK (3ull << HPTE_R_3_0_SSIZE_SHIFT) #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) #define HPTE_R_TS ASM_CONST(0x4000000000000000) #define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000) #define HPTE_R_RPN_SHIFT 12 #define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000) +#define HPTE_R_RPN_3_0 ASM_CONST(0x01fffffffffff000) #define HPTE_R_PP ASM_CONST(0x0000000000000003) #define HPTE_R_PPP ASM_CONST(0x8000000000000003) #define HPTE_R_N ASM_CONST(0x0000000000000004) @@ -316,12 +320,43 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize, */ v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm); v <<= HPTE_V_AVPN_SHIFT; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; return v; } /* + * ISA v3.0 defines a new HPTE format, which differs from the old + * format in having smaller AVPN and ARPN fields, and the B field + * in the second dword instead of the first. + */ +static inline unsigned long hpte_old_to_new_v(unsigned long v) +{ + /* trim AVPN, drop B */ + return v & HPTE_V_COMMON_BITS; +} + +static inline unsigned long hpte_old_to_new_r(unsigned long v, unsigned long r) +{ + /* move B field from 1st to 2nd dword, trim ARPN */ + return (r & ~HPTE_R_3_0_SSIZE_MASK) | + (((v) >> HPTE_V_SSIZE_SHIFT) << HPTE_R_3_0_SSIZE_SHIFT); +} + +static inline unsigned long hpte_new_to_old_v(unsigned long v, unsigned long r) +{ + /* insert B field */ + return (v & HPTE_V_COMMON_BITS) | + ((r & HPTE_R_3_0_SSIZE_MASK) << + (HPTE_V_SSIZE_SHIFT - HPTE_R_3_0_SSIZE_SHIFT)); +} + +static inline unsigned long hpte_new_to_old_r(unsigned long r) +{ + /* clear out B field */ + return r & ~HPTE_R_3_0_SSIZE_MASK; +} + +/* * This function sets the AVPN and L fields of the HPTE appropriately * using the base page size and actual page size. */ @@ -341,12 +376,8 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize, * aligned for the requested page size */ static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize, - int actual_psize, int ssize) + int actual_psize) { - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT; - /* A 4K page needs no special encoding */ if (actual_psize == MMU_PAGE_4K) return pa & HPTE_R_RPN; diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 05cabed3d1bd..09a802bb702f 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -99,6 +99,7 @@ #define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40 #define BOOK3S_INTERRUPT_HMI 0xe60 #define BOOK3S_INTERRUPT_H_DOORBELL 0xe80 +#define BOOK3S_INTERRUPT_H_VIRT 0xea0 #define BOOK3S_INTERRUPT_PERFMON 0xf00 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 #define BOOK3S_INTERRUPT_VSX 0xf40 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 28350a294b1e..e59b172666cd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -48,7 +48,7 @@ #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif -#define KVM_HALT_POLL_NS_DEFAULT 500000 +#define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */ /* These values are internal and can be increased later */ #define KVM_NR_IRQCHIPS 1 @@ -244,8 +244,10 @@ struct kvm_arch_memory_slot { struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + unsigned int tlb_sets; unsigned long hpt_virt; struct revmap_entry *revmap; + atomic64_t mmio_update; unsigned int host_lpid; unsigned long host_lpcr; unsigned long sdr1; @@ -408,6 +410,24 @@ struct kvmppc_passthru_irqmap { #define KVMPPC_IRQ_MPIC 1 #define KVMPPC_IRQ_XICS 2 +#define MMIO_HPTE_CACHE_SIZE 4 + +struct mmio_hpte_cache_entry { + unsigned long hpte_v; + unsigned long hpte_r; + unsigned long rpte; + unsigned long pte_index; + unsigned long eaddr; + unsigned long slb_v; + long mmio_update; + unsigned int slb_base_pshift; +}; + +struct mmio_hpte_cache { + struct mmio_hpte_cache_entry entry[MMIO_HPTE_CACHE_SIZE]; + unsigned int index; +}; + struct openpic; struct kvm_vcpu_arch { @@ -498,6 +518,8 @@ struct kvm_vcpu_arch { ulong tcscr; ulong acop; ulong wort; + ulong tid; + ulong psscr; ulong shadow_srr1; #endif u32 vrsave; /* also USPRG0 */ @@ -546,6 +568,7 @@ struct kvm_vcpu_arch { u64 tfiar; u32 cr_tm; + u64 xer_tm; u64 lr_tm; u64 ctr_tm; u64 amr_tm; @@ -655,9 +678,11 @@ struct kvm_vcpu_arch { #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE struct kvm_vcpu_arch_shared shregs; + struct mmio_hpte_cache mmio_cache; unsigned long pgfault_addr; long pgfault_index; unsigned long pgfault_hpte[2]; + struct mmio_hpte_cache_entry *pgfault_cache; struct task_struct *run_task; struct kvm_run *kvm_run; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f6e49640dbe1..2da67bf1f2ec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -483,9 +483,10 @@ extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq, unsigned long host_irq); extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq, unsigned long host_irq); -extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, - struct kvmppc_irq_map *irq_map, - struct kvmppc_passthru_irqmap *pimap); +extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr, + struct kvmppc_irq_map *irq_map, + struct kvmppc_passthru_irqmap *pimap, + bool *again); extern int h_ipi_redirect; #else static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( @@ -510,6 +511,48 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) #endif /* + * Prototypes for functions called only from assembler code. + * Having prototypes reduces sparse errors. + */ +long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce); +long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages); +long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages); +long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, + unsigned int yield_count); +long kvmppc_h_random(struct kvm_vcpu *vcpu); +void kvmhv_commence_exit(int trap); +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); +void kvmppc_subcore_enter_guest(void); +void kvmppc_subcore_exit_guest(void); +long kvmppc_realmode_hmi_handler(void); +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel); +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn); +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu); +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long va); +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index); +long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index); +long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index); +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status, bool data); +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); + +/* * Host-side operations we want to set up while running in real * mode in the guest operating on the xics. * Currently only VCPU wakeup is supported. diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e311c25751a4..8d1499334257 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -214,6 +214,11 @@ extern u64 ppc64_rma_size; /* Cleanup function used by kexec */ extern void mmu_cleanup_all(void); extern void radix__mmu_cleanup_all(void); + +/* Functions for creating and updating partition table on POWER9 */ +extern void mmu_partition_table_init(void); +extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1); #endif /* CONFIG_PPC64 */ struct mm_struct; diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index e958b7096f19..5c7db0f1a708 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -220,9 +220,12 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id, int64_t opal_pci_poll2(uint64_t id, uint64_t data); int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll); +int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll); int64_t opal_int_set_cppr(uint8_t cppr); int64_t opal_int_eoi(uint32_t xirr); +int64_t opal_rm_int_eoi(uint32_t xirr); int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr); +int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr); int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, uint32_t pe_num, uint32_t tce_size, uint64_t dma_addr, uint32_t npages); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 9e1499f98def..04aa1ee8cdb6 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -153,6 +153,8 @@ #define PSSCR_EC 0x00100000 /* Exit Criterion */ #define PSSCR_ESL 0x00200000 /* Enable State Loss */ #define PSSCR_SD 0x00400000 /* Status Disable */ +#define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */ +#define PSSCR_GUEST_VIS 0xf0000000000003ff /* Guest-visible PSSCR fields */ /* Floating Point Status and Control Register (FPSCR) Fields */ #define FPSCR_FX 0x80000000 /* FPU exception summary */ @@ -236,6 +238,7 @@ #define SPRN_TEXASRU 0x83 /* '' '' '' Upper 32 */ #define TEXASR_FS __MASK(63-36) /* TEXASR Failure Summary */ #define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */ +#define SPRN_TIDR 144 /* Thread ID register */ #define SPRN_CTRLF 0x088 #define SPRN_CTRLT 0x098 #define CTRL_CT 0xc0000000 /* current thread */ @@ -294,6 +297,7 @@ #define SPRN_HSRR1 0x13B /* Hypervisor Save/Restore 1 */ #define SPRN_LMRR 0x32D /* Load Monitor Region Register */ #define SPRN_LMSER 0x32E /* Load Monitor Section Enable Register */ +#define SPRN_ASDR 0x330 /* Access segment descriptor register */ #define SPRN_IC 0x350 /* Virtual Instruction Count */ #define SPRN_VTB 0x351 /* Virtual Time Base */ #define SPRN_LDBAR 0x352 /* LD Base Address Register */ @@ -305,6 +309,7 @@ /* HFSCR and FSCR bit numbers are the same */ #define FSCR_LM_LG 11 /* Enable Load Monitor Registers */ +#define FSCR_MSGP_LG 10 /* Enable MSGP */ #define FSCR_TAR_LG 8 /* Enable Target Address Register */ #define FSCR_EBB_LG 7 /* Enable Event Based Branching */ #define FSCR_TM_LG 5 /* Enable Transactional Memory */ @@ -320,6 +325,7 @@ #define FSCR_DSCR __MASK(FSCR_DSCR_LG) #define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */ #define HFSCR_LM __MASK(FSCR_LM_LG) +#define HFSCR_MSGP __MASK(FSCR_MSGP_LG) #define HFSCR_TAR __MASK(FSCR_TAR_LG) #define HFSCR_EBB __MASK(FSCR_EBB_LG) #define HFSCR_TM __MASK(FSCR_TM_LG) @@ -358,6 +364,7 @@ #define LPCR_PECE_HVEE ASM_CONST(0x0000400000000000) /* P9 Wakeup on HV interrupts */ #define LPCR_MER ASM_CONST(0x0000000000000800) /* Mediated External Exception */ #define LPCR_MER_SH 11 +#define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */ #define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */ #define LPCR_LPES 0x0000000c #define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */ @@ -378,6 +385,12 @@ #define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ #define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ #define PCR_TM_DIS (1ul << (63-2)) /* Trans. memory disable (POWER8) */ +/* + * These bits are used in the function kvmppc_set_arch_compat() to specify and + * determine both the compatibility level which we want to emulate and the + * compatibility level which the host is capable of emulating. + */ +#define PCR_ARCH_207 0x8 /* Architecture 2.07 */ #define PCR_ARCH_206 0x4 /* Architecture 2.06 */ #define PCR_ARCH_205 0x2 /* Architecture 2.05 */ #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ @@ -1219,6 +1232,7 @@ #define PVR_ARCH_206 0x0f000003 #define PVR_ARCH_206p 0x0f100003 #define PVR_ARCH_207 0x0f000004 +#define PVR_ARCH_300 0x0f000005 /* Macros for setting and retrieving special purpose registers */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..3603b6f51b11 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -573,6 +573,10 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_SPRG9 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba) #define KVM_REG_PPC_DBSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb) +/* POWER9 registers */ +#define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) +#define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ @@ -596,6 +600,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TM_VSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67) #define KVM_REG_PPC_TM_DSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68) #define KVM_REG_PPC_TM_TAR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69) +#define KVM_REG_PPC_TM_XER (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x6a) /* PPC64 eXternal Interrupt Controller Specification */ #define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index caec7bf3b99a..195a9fc8f81c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -487,6 +487,7 @@ int main(void) /* book3s */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + DEFINE(KVM_TLB_SETS, offsetof(struct kvm, arch.tlb_sets)); DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -548,6 +549,8 @@ int main(void) DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr)); DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); + DEFINE(VCPU_TID, offsetof(struct kvm_vcpu, arch.tid)); + DEFINE(VCPU_PSSCR, offsetof(struct kvm_vcpu, arch.psscr)); DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); @@ -569,6 +572,7 @@ int main(void) DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr)); DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm)); DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm)); + DEFINE(VCPU_XER_TM, offsetof(struct kvm_vcpu, arch.xer_tm)); DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm)); DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm)); DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm)); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 37c027ca83b2..f3e1f5d29dce 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -174,7 +174,7 @@ __init_FSCR: __init_HFSCR: mfspr r3,SPRN_HFSCR ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\ - HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB + HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP mtspr SPRN_HFSCR,r3 blr diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 05f09ae82587..b795dd1ac2ef 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -88,6 +88,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) /* 128 (2**7) bytes in each HPTEG */ kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + atomic64_set(&kvm->arch.mmio_update, 0); + /* Allocate reverse map array */ rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); if (!rev) { @@ -255,7 +257,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, msr); } -long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret) { @@ -312,7 +314,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_slb *slbe; unsigned long slb_v; unsigned long pp, key; - unsigned long v, gr; + unsigned long v, orig_v, gr; __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); @@ -337,10 +339,12 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return -ENOENT; } hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); gr = kvm->arch.revmap[index].guest_rpte; - unlock_hpte(hptep, v); + unlock_hpte(hptep, orig_v); preempt_enable(); gpte->eaddr = eaddr; @@ -438,6 +442,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, { struct kvm *kvm = vcpu->kvm; unsigned long hpte[3], r; + unsigned long hnow_v, hnow_r; __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; @@ -451,6 +456,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int writing, write_ok; struct vm_area_struct *vma; unsigned long rcbits; + long mmio_update; /* * Real-mode code has already searched the HPT and found the @@ -460,6 +466,19 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST; + + if (vcpu->arch.pgfault_cache) { + mmio_update = atomic64_read(&kvm->arch.mmio_update); + if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { + r = vcpu->arch.pgfault_cache->rpte; + psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); + gpa_base = r & HPTE_R_RPN & ~(psize - 1); + gfn_base = gpa_base >> PAGE_SHIFT; + gpa = gpa_base | (ea & (psize - 1)); + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + } + } index = vcpu->arch.pgfault_index; hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); rev = &kvm->arch.revmap[index]; @@ -472,6 +491,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unlock_hpte(hptep, hpte[0]); preempt_enable(); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]); + hpte[1] = hpte_new_to_old_r(hpte[1]); + } if (hpte[0] != vcpu->arch.pgfault_hpte[0] || hpte[1] != vcpu->arch.pgfault_hpte[1]) return RESUME_GUEST; @@ -575,16 +598,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ if (psize < PAGE_SIZE) psize = PAGE_SIZE; - r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1)); + r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | + ((pfn << PAGE_SHIFT) & ~(psize - 1)); if (hpte_is_writable(r) && !write_ok) r = hpte_make_readonly(r); ret = RESUME_GUEST; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || - be64_to_cpu(hptep[1]) != hpte[1] || - rev->guest_rpte != hpte[2]) + hnow_v = be64_to_cpu(hptep[0]); + hnow_r = be64_to_cpu(hptep[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); + hnow_r = hpte_new_to_old_r(hnow_r); + } + if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || + rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; @@ -615,6 +644,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); } + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + r = hpte_old_to_new_r(hpte[0], r); + hpte[0] = hpte_old_to_new_v(hpte[0]); + } hptep[1] = cpu_to_be64(r); eieio(); __unlock_hpte(hptep, hpte[0]); @@ -758,6 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, hpte_rpn(ptel, psize) == gfn) { hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); /* Harvest R and C */ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; @@ -1165,7 +1199,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { - unsigned long v, r; + unsigned long v, r, hr; unsigned long rcbits_unset; int ok = 1; int valid, dirty; @@ -1192,6 +1226,11 @@ static long record_hpte(unsigned long flags, __be64 *hptp, while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hptp[0]); + hr = be64_to_cpu(hptp[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, hr); + hr = hpte_new_to_old_r(hr); + } /* re-evaluate valid and dirty from synchronized HPTE value */ valid = !!(v & HPTE_V_VALID); @@ -1199,8 +1238,8 @@ static long record_hpte(unsigned long flags, __be64 *hptp, /* Harvest R and C into guest view if necessary */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { - revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + if (valid && (rcbits_unset & hr)) { + revp->guest_rpte |= (hr & (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; dirty = 1; } @@ -1608,7 +1647,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, return ret; } -ssize_t debugfs_htab_write(struct file *file, const char __user *buf, +static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return -EACCES; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index d461c440889a..e4c4ea973e57 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -39,7 +39,6 @@ #include <asm/udbg.h> #include <asm/iommu.h> #include <asm/tce.h> -#include <asm/iommu.h> #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 39ef1f4a7b02..8dcbe37a4dac 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -54,6 +54,9 @@ #include <asm/dbell.h> #include <asm/hmi.h> #include <asm/pnv-pci.h> +#include <asm/mmu.h> +#include <asm/opal.h> +#include <asm/xics.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -62,6 +65,7 @@ #include <linux/irqbypass.h> #include <linux/module.h> #include <linux/compiler.h> +#include <linux/of.h> #include "book3s.h" @@ -104,23 +108,6 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif -/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ -static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; -module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); - -/* Factor by which the vcore halt poll interval is grown, default is to double - */ -static unsigned int halt_poll_ns_grow = 2; -module_param(halt_poll_ns_grow, int, S_IRUGO); -MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); - -/* Factor by which the vcore halt poll interval is shrunk, default is to reset - */ -static unsigned int halt_poll_ns_shrink; -module_param(halt_poll_ns_shrink, int, S_IRUGO); -MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); - static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -146,12 +133,21 @@ static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, static bool kvmppc_ipi_thread(int cpu) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + + /* On POWER9 we can use msgsnd to IPI any cpu */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + msg |= get_hard_smp_processor_id(cpu); + smp_mb(); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + return true; + } + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) { preempt_disable(); if (cpu_first_thread_sibling(cpu) == cpu_first_thread_sibling(smp_processor_id())) { - unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); msg |= cpu_thread_in_core(cpu); smp_mb(); __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); @@ -162,8 +158,12 @@ static bool kvmppc_ipi_thread(int cpu) } #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) - if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) { - xics_wake_cpu(cpu); + if (cpu >= 0 && cpu < nr_cpu_ids) { + if (paca[cpu].kvm_hstate.xics_phys) { + xics_wake_cpu(cpu); + return true; + } + opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); return true; } #endif @@ -299,41 +299,54 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) vcpu->arch.pvr = pvr; } +/* Dummy value used in computing PCR value below */ +#define PCR_ARCH_300 (PCR_ARCH_207 << 1) + static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) { - unsigned long pcr = 0; + unsigned long host_pcr_bit = 0, guest_pcr_bit = 0; struct kvmppc_vcore *vc = vcpu->arch.vcore; + /* We can (emulate) our own architecture version and anything older */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + host_pcr_bit = PCR_ARCH_300; + else if (cpu_has_feature(CPU_FTR_ARCH_207S)) + host_pcr_bit = PCR_ARCH_207; + else if (cpu_has_feature(CPU_FTR_ARCH_206)) + host_pcr_bit = PCR_ARCH_206; + else + host_pcr_bit = PCR_ARCH_205; + + /* Determine lowest PCR bit needed to run guest in given PVR level */ + guest_pcr_bit = host_pcr_bit; if (arch_compat) { switch (arch_compat) { case PVR_ARCH_205: - /* - * If an arch bit is set in PCR, all the defined - * higher-order arch bits also have to be set. - */ - pcr = PCR_ARCH_206 | PCR_ARCH_205; + guest_pcr_bit = PCR_ARCH_205; break; case PVR_ARCH_206: case PVR_ARCH_206p: - pcr = PCR_ARCH_206; + guest_pcr_bit = PCR_ARCH_206; break; case PVR_ARCH_207: + guest_pcr_bit = PCR_ARCH_207; + break; + case PVR_ARCH_300: + guest_pcr_bit = PCR_ARCH_300; break; default: return -EINVAL; } - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { - /* POWER7 can't emulate POWER8 */ - if (!(pcr & PCR_ARCH_206)) - return -EINVAL; - pcr &= ~PCR_ARCH_206; - } } + /* Check requested PCR bits don't exceed our capabilities */ + if (guest_pcr_bit > host_pcr_bit) + return -EINVAL; + spin_lock(&vc->lock); vc->arch_compat = arch_compat; - vc->pcr = pcr; + /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ + vc->pcr = host_pcr_bit - guest_pcr_bit; spin_unlock(&vc->lock); return 0; @@ -945,6 +958,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_EXTERNAL: case BOOK3S_INTERRUPT_H_DOORBELL: + case BOOK3S_INTERRUPT_H_VIRT: vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; @@ -1229,6 +1243,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_WORT: *val = get_reg_val(id, vcpu->arch.wort); break; + case KVM_REG_PPC_TIDR: + *val = get_reg_val(id, vcpu->arch.tid); + break; + case KVM_REG_PPC_PSSCR: + *val = get_reg_val(id, vcpu->arch.psscr); + break; case KVM_REG_PPC_VPA_ADDR: spin_lock(&vcpu->arch.vpa_update_lock); *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); @@ -1288,6 +1308,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_TM_CR: *val = get_reg_val(id, vcpu->arch.cr_tm); break; + case KVM_REG_PPC_TM_XER: + *val = get_reg_val(id, vcpu->arch.xer_tm); + break; case KVM_REG_PPC_TM_LR: *val = get_reg_val(id, vcpu->arch.lr_tm); break; @@ -1427,6 +1450,12 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_WORT: vcpu->arch.wort = set_reg_val(id, *val); break; + case KVM_REG_PPC_TIDR: + vcpu->arch.tid = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PSSCR: + vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS; + break; case KVM_REG_PPC_VPA_ADDR: addr = set_reg_val(id, *val); r = -EINVAL; @@ -1498,6 +1527,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_TM_CR: vcpu->arch.cr_tm = set_reg_val(id, *val); break; + case KVM_REG_PPC_TM_XER: + vcpu->arch.xer_tm = set_reg_val(id, *val); + break; case KVM_REG_PPC_TM_LR: vcpu->arch.lr_tm = set_reg_val(id, *val); break; @@ -1540,6 +1572,20 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, return r; } +/* + * On POWER9, threads are independent and can be in different partitions. + * Therefore we consider each thread to be a subcore. + * There is a restriction that all threads have to be in the same + * MMU mode (radix or HPT), unfortunately, but since we only support + * HPT guests on a HPT host so far, that isn't an impediment yet. + */ +static int threads_per_vcore(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return 1; + return threads_per_subcore; +} + static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) { struct kvmppc_vcore *vcore; @@ -1554,7 +1600,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; - vcore->first_vcpuid = core * threads_per_subcore; + vcore->first_vcpuid = core * threads_per_vcore(); vcore->kvm = kvm; INIT_LIST_HEAD(&vcore->preempt_list); @@ -1717,7 +1763,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, int core; struct kvmppc_vcore *vcore; - core = id / threads_per_subcore; + core = id / threads_per_vcore(); if (core >= KVM_MAX_VCORES) goto out; @@ -1935,7 +1981,10 @@ static void kvmppc_wait_for_nap(void) { int cpu = smp_processor_id(); int i, loops; + int n_threads = threads_per_vcore(); + if (n_threads <= 1) + return; for (loops = 0; loops < 1000000; ++loops) { /* * Check if all threads are finished. @@ -1943,17 +1992,17 @@ static void kvmppc_wait_for_nap(void) * and the thread clears it when finished, so we look * for any threads that still have a non-NULL vcore ptr. */ - for (i = 1; i < threads_per_subcore; ++i) + for (i = 1; i < n_threads; ++i) if (paca[cpu + i].kvm_hstate.kvm_vcore) break; - if (i == threads_per_subcore) { + if (i == n_threads) { HMT_medium(); return; } HMT_low(); } HMT_medium(); - for (i = 1; i < threads_per_subcore; ++i) + for (i = 1; i < n_threads; ++i) if (paca[cpu + i].kvm_hstate.kvm_vcore) pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); } @@ -2019,7 +2068,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_PREEMPT; vc->pcpu = smp_processor_id(); - if (vc->num_threads < threads_per_subcore) { + if (vc->num_threads < threads_per_vcore()) { spin_lock(&lp->lock); list_add_tail(&vc->preempt_list, &lp->list); spin_unlock(&lp->lock); @@ -2123,8 +2172,7 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) cip->subcore_threads[sub] = vc->num_threads; cip->subcore_vm[sub] = vc->kvm; init_master_vcore(vc); - list_del(&vc->preempt_list); - list_add_tail(&vc->preempt_list, &cip->vcs[sub]); + list_move_tail(&vc->preempt_list, &cip->vcs[sub]); return true; } @@ -2309,6 +2357,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) unsigned long cmd_bit, stat_bit; int pcpu, thr; int target_threads; + int controlled_threads; /* * Remove from the list any threads that have a signal pending @@ -2327,11 +2376,18 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) vc->preempt_tb = TB_NIL; /* + * Number of threads that we will be controlling: the same as + * the number of threads per subcore, except on POWER9, + * where it's 1 because the threads are (mostly) independent. + */ + controlled_threads = threads_per_vcore(); + + /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. */ - if ((threads_per_core > 1) && + if ((controlled_threads > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; @@ -2347,7 +2403,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ init_core_info(&core_info, vc); pcpu = smp_processor_id(); - target_threads = threads_per_subcore; + target_threads = controlled_threads; if (target_smt_mode && target_smt_mode < target_threads) target_threads = target_smt_mode; if (vc->num_threads < target_threads) @@ -2383,7 +2439,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) smp_wmb(); } pcpu = smp_processor_id(); - for (thr = 0; thr < threads_per_subcore; ++thr) + for (thr = 0; thr < controlled_threads; ++thr) paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; /* Initiate micro-threading (split-core) if required */ @@ -2493,7 +2549,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } /* Let secondaries go back to the offline loop */ - for (i = 0; i < threads_per_subcore; ++i) { + for (i = 0; i < controlled_threads; ++i) { kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); @@ -2545,9 +2601,6 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc) vc->halt_poll_ns = 10000; else vc->halt_poll_ns *= halt_poll_ns_grow; - - if (vc->halt_poll_ns > halt_poll_max_ns) - vc->halt_poll_ns = halt_poll_max_ns; } static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) @@ -2558,7 +2611,8 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) vc->halt_poll_ns /= halt_poll_ns_shrink; } -/* Check to see if any of the runnable vcpus on the vcore have pending +/* + * Check to see if any of the runnable vcpus on the vcore have pending * exceptions or are no longer ceded */ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) @@ -2657,16 +2711,18 @@ out: } /* Adjust poll time */ - if (halt_poll_max_ns) { + if (halt_poll_ns) { if (block_ns <= vc->halt_poll_ns) ; /* We slept and blocked for longer than the max halt time */ - else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns) + else if (vc->halt_poll_ns && block_ns > halt_poll_ns) shrink_halt_poll_ns(vc); /* We slept and our poll time is too small */ - else if (vc->halt_poll_ns < halt_poll_max_ns && - block_ns < halt_poll_max_ns) + else if (vc->halt_poll_ns < halt_poll_ns && + block_ns < halt_poll_ns) grow_halt_poll_ns(vc); + if (vc->halt_poll_ns > halt_poll_ns) + vc->halt_poll_ns = halt_poll_ns; } else vc->halt_poll_ns = 0; @@ -2973,6 +3029,15 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + /* + * If we are making a new memslot, it might make + * some address that was previously cached as emulated + * MMIO be no longer emulated MMIO, so invalidate + * all the caches of emulated MMIO translations. + */ + if (npages) + atomic64_inc(&kvm->arch.mmio_update); + if (npages && old->npages) { /* * If modifying a memslot, reset all the rmap dirty bits. @@ -3017,6 +3082,22 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) return; } +static void kvmppc_setup_partition_table(struct kvm *kvm) +{ + unsigned long dw0, dw1; + + /* PS field - page size for VRMA */ + dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | + ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); + /* HTABSIZE and HTABORG fields */ + dw0 |= kvm->arch.sdr1; + + /* Second dword has GR=0; other fields are unused since UPRT=0 */ + dw1 = 0; + + mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); +} + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -3068,17 +3149,20 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) psize == 0x1000000)) goto out_srcu; - /* Update VRMASD field in the LPCR */ senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - /* the -4 is to account for senc values starting at 0x10 */ - lpcr = senc << (LPCR_VRMASD_SH - 4); - /* Create HPTEs in the hash page table for the VRMA */ kvmppc_map_vrma(vcpu, memslot, porder); - kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + /* Update VRMASD field in the LPCR */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + } else { + kvmppc_setup_partition_table(kvm); + } /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ smp_wmb(); @@ -3193,14 +3277,18 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, * make sure we flush on each core before running the new VM. + * On POWER9, the tlbie in mmu_partition_table_set_entry() + * does this flush for us. */ - cpumask_setall(&kvm->arch.need_tlb_flush); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + cpumask_setall(&kvm->arch.need_tlb_flush); /* Start out with the default set of hcalls enabled */ memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, sizeof(kvm->arch.enabled_hcalls)); - kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); /* Init LPCR for virtual RMA mode */ kvm->arch.host_lpid = mfspr(SPRN_LPID); @@ -3213,9 +3301,29 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* On POWER8 turn on online bit to enable PURR/SPURR */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) lpcr |= LPCR_ONL; + /* + * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) + * Set HVICE bit to enable hypervisor virtualization interrupts. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + lpcr &= ~LPCR_VPM0; + lpcr |= LPCR_HVICE; + } + kvm->arch.lpcr = lpcr; /* + * Work out how many sets the TLB has, for the use of + * the TLB invalidation loop in book3s_hv_rmhandlers.S. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ + else if (cpu_has_feature(CPU_FTR_ARCH_207S)) + kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ + else + kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */ + + /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. */ @@ -3279,9 +3387,9 @@ static int kvmppc_core_check_processor_compat_hv(void) !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; /* - * Disable KVM for Power9, untill the required bits merged. + * Disable KVM for Power9 in radix mode. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) return -EIO; return 0; @@ -3635,6 +3743,23 @@ static int kvmppc_book3s_init_hv(void) if (r) return r; + /* + * We need a way of accessing the XICS interrupt controller, + * either directly, via paca[cpu].kvm_hstate.xics_phys, or + * indirectly, via OPAL. + */ +#ifdef CONFIG_SMP + if (!get_paca()->kvm_hstate.xics_phys) { + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); + if (!np) { + pr_err("KVM-HV: Cannot determine method for accessing XICS\n"); + return -ENODEV; + } + } +#endif + kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; @@ -3657,3 +3782,4 @@ module_exit(kvmppc_book3s_exit_hv); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(KVM_MINOR); MODULE_ALIAS("devname:kvm"); + diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 0c84d6bc8356..5bb24be0b346 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -26,6 +26,8 @@ #include <asm/dbell.h> #include <asm/cputhreads.h> #include <asm/io.h> +#include <asm/opal.h> +#include <asm/smp.h> #define KVM_CMA_CHUNK_ORDER 18 @@ -205,12 +207,18 @@ static inline void rm_writeb(unsigned long paddr, u8 val) void kvmhv_rm_send_ipi(int cpu) { unsigned long xics_phys; + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); - /* On POWER8 for IPIs to threads in the same core, use msgsnd */ + /* On POWER9 we can use msgsnd for any destination cpu. */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + msg |= get_hard_smp_processor_id(cpu); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + return; + } + /* On POWER8 for IPIs to threads in the same core, use msgsnd. */ if (cpu_has_feature(CPU_FTR_ARCH_207S) && cpu_first_thread_sibling(cpu) == cpu_first_thread_sibling(raw_smp_processor_id())) { - unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); msg |= cpu_thread_in_core(cpu); __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); return; @@ -218,7 +226,11 @@ void kvmhv_rm_send_ipi(int cpu) /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + if (xics_phys) + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + else + opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), + IPI_PRIORITY); } /* @@ -329,7 +341,7 @@ static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap, * saved a copy of the XIRR in the PACA, it will be picked up by * the host ICP driver. */ -static int kvmppc_check_passthru(u32 xisr, __be32 xirr) +static int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again) { struct kvmppc_passthru_irqmap *pimap; struct kvmppc_irq_map *irq_map; @@ -348,11 +360,11 @@ static int kvmppc_check_passthru(u32 xisr, __be32 xirr) /* We're handling this interrupt, generic code doesn't need to */ local_paca->kvm_hstate.saved_xirr = 0; - return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap); + return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap, again); } #else -static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) +static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again) { return 1; } @@ -367,14 +379,31 @@ static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) * -1 if there was a guest wakeup IPI (which has now been cleared) * -2 if there is PCI passthrough external interrupt that was handled */ +static long kvmppc_read_one_intr(bool *again); long kvmppc_read_intr(void) { + long ret = 0; + long rc; + bool again; + + do { + again = false; + rc = kvmppc_read_one_intr(&again); + if (rc && (ret == 0 || rc > ret)) + ret = rc; + } while (again); + return ret; +} + +static long kvmppc_read_one_intr(bool *again) +{ unsigned long xics_phys; u32 h_xirr; __be32 xirr; u32 xisr; u8 host_ipi; + int64_t rc; /* see if a host IPI is pending */ host_ipi = local_paca->kvm_hstate.host_ipi; @@ -383,8 +412,14 @@ long kvmppc_read_intr(void) /* Now read the interrupt from the ICP */ xics_phys = local_paca->kvm_hstate.xics_phys; - if (unlikely(!xics_phys)) - return 1; + if (!xics_phys) { + /* Use OPAL to read the XIRR */ + rc = opal_rm_int_get_xirr(&xirr, false); + if (rc < 0) + return 1; + } else { + xirr = _lwzcix(xics_phys + XICS_XIRR); + } /* * Save XIRR for later. Since we get control in reverse endian @@ -392,7 +427,6 @@ long kvmppc_read_intr(void) * host endian. Note that xirr is the value read from the * XIRR register, while h_xirr is the host endian version. */ - xirr = _lwzcix(xics_phys + XICS_XIRR); h_xirr = be32_to_cpu(xirr); local_paca->kvm_hstate.saved_xirr = h_xirr; xisr = h_xirr & 0xffffff; @@ -411,8 +445,16 @@ long kvmppc_read_intr(void) * If it is an IPI, clear the MFRR and EOI it. */ if (xisr == XICS_IPI) { - _stbcix(xics_phys + XICS_MFRR, 0xff); - _stwcix(xics_phys + XICS_XIRR, xirr); + if (xics_phys) { + _stbcix(xics_phys + XICS_MFRR, 0xff); + _stwcix(xics_phys + XICS_XIRR, xirr); + } else { + opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); + rc = opal_rm_int_eoi(h_xirr); + /* If rc > 0, there is another interrupt pending */ + *again = rc > 0; + } + /* * Need to ensure side effects of above stores * complete before proceeding. @@ -429,7 +471,11 @@ long kvmppc_read_intr(void) /* We raced with the host, * we need to resend that IPI, bummer */ - _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + if (xics_phys) + _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + else + opal_rm_int_set_mfrr(hard_smp_processor_id(), + IPI_PRIORITY); /* Let side effects complete */ smp_mb(); return 1; @@ -440,5 +486,5 @@ long kvmppc_read_intr(void) return -1; } - return kvmppc_check_passthru(xisr, xirr); + return kvmppc_check_passthru(xisr, xirr, again); } diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 0fa70a9618d7..7ef0993214f3 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -16,6 +16,7 @@ #include <asm/machdep.h> #include <asm/cputhreads.h> #include <asm/hmi.h> +#include <asm/kvm_ppc.h> /* SRR1 bits for machine check on POWER7 */ #define SRR1_MC_LDSTERR (1ul << (63-42)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 99b4e9d5dd23..9ef3c4be952f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -264,8 +264,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (pa) pteh |= HPTE_V_VALID; - else + else { pteh |= HPTE_V_ABSENT; + ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO); + } /*If we had host pte mapping then Check WIMG */ if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) { @@ -351,6 +353,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; + ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO); unlock_rmap(rmap); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, @@ -361,6 +364,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } } + /* Convert to new format on P9 */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + ptel = hpte_old_to_new_r(pteh, ptel); + pteh = hpte_old_to_new_v(pteh); + } hpte[1] = cpu_to_be64(ptel); /* Write the first HPTE dword, unlocking the HPTE and making it valid */ @@ -386,6 +394,13 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, #define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) #endif +static inline int is_mmio_hpte(unsigned long v, unsigned long r) +{ + return ((v & HPTE_V_ABSENT) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)); +} + static inline int try_lock_tlbie(unsigned int *lock) { unsigned int tmp, old; @@ -409,13 +424,18 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, { long i; + /* + * We use the POWER9 5-operand versions of tlbie and tlbiel here. + * Since we are using RIC=0 PRS=0 R=0, and P7/P8 tlbiel ignores + * the RS field, this is backwards-compatible with P7 and P8. + */ if (global) { while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); if (need_sync) asm volatile("ptesync" : : : "memory"); for (i = 0; i < npages; ++i) - asm volatile(PPC_TLBIE(%1,%0) : : + asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : "r" (rbvalues[i]), "r" (kvm->arch.lpid)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); kvm->arch.tlbie_lock = 0; @@ -423,7 +443,8 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, if (need_sync) asm volatile("ptesync" : : : "memory"); for (i = 0; i < npages; ++i) - asm volatile("tlbiel %0" : : "r" (rbvalues[i])); + asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : : + "r" (rbvalues[i]), "r" (0)); asm volatile("ptesync" : : : "memory"); } } @@ -435,18 +456,23 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, __be64 *hpte; unsigned long v, r, rb; struct revmap_entry *rev; - u64 pte; + u64 pte, orig_pte, pte_r; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(hpte[0]); + pte = orig_pte = be64_to_cpu(hpte[0]); + pte_r = be64_to_cpu(hpte[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + pte = hpte_new_to_old_v(pte, pte_r); + pte_r = hpte_new_to_old_r(pte_r); + } if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (pte & avpn) != 0)) { - __unlock_hpte(hpte, pte); + __unlock_hpte(hpte, orig_pte); return H_NOT_FOUND; } @@ -454,7 +480,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, v = pte & ~HPTE_V_HVLOCK; if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); - rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index); + rb = compute_tlbie_rb(v, pte_r, pte_index); do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); /* * The reference (R) and change (C) bits in a HPT @@ -472,6 +498,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); + if (is_mmio_hpte(v, pte_r)) + atomic64_inc(&kvm->arch.mmio_update); + if (v & HPTE_V_ABSENT) v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID; hpret[0] = v; @@ -498,7 +527,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) int global; long int ret = H_SUCCESS; struct revmap_entry *rev, *revs[4]; - u64 hp0; + u64 hp0, hp1; global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { @@ -531,6 +560,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } found = 0; hp0 = be64_to_cpu(hp[0]); + hp1 = be64_to_cpu(hp[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hp0 = hpte_new_to_old_v(hp0, hp1); + hp1 = hpte_new_to_old_r(hp1); + } if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) { switch (flags & 3) { case 0: /* absolute */ @@ -561,13 +595,14 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); args[j] |= rcbits << (56 - 5); hp[0] = 0; + if (is_mmio_hpte(hp0, hp1)) + atomic64_inc(&kvm->arch.mmio_update); continue; } /* leave it locked */ hp[0] &= ~cpu_to_be64(HPTE_V_VALID); - tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]), - be64_to_cpu(hp[1]), pte_index); + tlbrb[n] = compute_tlbie_rb(hp0, hp1, pte_index); indexes[n] = j; hptes[n] = hp; revs[n] = rev; @@ -605,7 +640,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, __be64 *hpte; struct revmap_entry *rev; unsigned long v, r, rb, mask, bits; - u64 pte; + u64 pte_v, pte_r; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -613,14 +648,16 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(hpte[0]); - if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || - ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { - __unlock_hpte(hpte, pte); + v = pte_v = be64_to_cpu(hpte[0]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1])); + if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || + ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) { + __unlock_hpte(hpte, pte_v); return H_NOT_FOUND; } - v = pte; + pte_r = be64_to_cpu(hpte[1]); bits = (flags << 55) & HPTE_R_PP0; bits |= (flags << 48) & HPTE_R_KEY_HI; bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); @@ -642,22 +679,26 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, * readonly to writable. If it should be writable, we'll * take a trap and let the page fault code sort it out. */ - pte = be64_to_cpu(hpte[1]); - r = (pte & ~mask) | bits; - if (hpte_is_writable(r) && !hpte_is_writable(pte)) + r = (pte_r & ~mask) | bits; + if (hpte_is_writable(r) && !hpte_is_writable(pte_r)) r = hpte_make_readonly(r); /* If the PTE is changing, invalidate it first */ - if (r != pte) { + if (r != pte_r) { rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) | + hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) | HPTE_V_ABSENT); do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); + /* Don't lose R/C bit updates done by hardware */ + r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C); hpte[1] = cpu_to_be64(r); } } - unlock_hpte(hpte, v & ~HPTE_V_HVLOCK); + unlock_hpte(hpte, pte_v & ~HPTE_V_HVLOCK); asm volatile("ptesync" : : : "memory"); + if (is_mmio_hpte(v, pte_r)) + atomic64_inc(&kvm->arch.mmio_update); + return H_SUCCESS; } @@ -681,6 +722,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, r); + r = hpte_new_to_old_r(r); + } if (v & HPTE_V_ABSENT) { v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; @@ -798,10 +843,16 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index) { unsigned long rb; + u64 hp0, hp1; hptep[0] &= ~cpu_to_be64(HPTE_V_VALID); - rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]), - pte_index); + hp0 = be64_to_cpu(hptep[0]); + hp1 = be64_to_cpu(hptep[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hp0 = hpte_new_to_old_v(hp0, hp1); + hp1 = hpte_new_to_old_r(hp1); + } + rb = compute_tlbie_rb(hp0, hp1, pte_index); do_tlbies(kvm, &rb, 1, 1, true); } EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); @@ -811,9 +862,15 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, { unsigned long rb; unsigned char rbyte; + u64 hp0, hp1; - rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]), - pte_index); + hp0 = be64_to_cpu(hptep[0]); + hp1 = be64_to_cpu(hptep[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hp0 = hpte_new_to_old_v(hp0, hp1); + hp1 = hpte_new_to_old_r(hp1); + } + rb = compute_tlbie_rb(hp0, hp1, pte_index); rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8; /* modify only the second-last byte, which contains the ref bit */ *((char *)hptep + 14) = rbyte; @@ -828,6 +885,37 @@ static int slb_base_page_shift[4] = { 20, /* 1M, unsupported */ }; +static struct mmio_hpte_cache_entry *mmio_cache_search(struct kvm_vcpu *vcpu, + unsigned long eaddr, unsigned long slb_v, long mmio_update) +{ + struct mmio_hpte_cache_entry *entry = NULL; + unsigned int pshift; + unsigned int i; + + for (i = 0; i < MMIO_HPTE_CACHE_SIZE; i++) { + entry = &vcpu->arch.mmio_cache.entry[i]; + if (entry->mmio_update == mmio_update) { + pshift = entry->slb_base_pshift; + if ((entry->eaddr >> pshift) == (eaddr >> pshift) && + entry->slb_v == slb_v) + return entry; + } + } + return NULL; +} + +static struct mmio_hpte_cache_entry * + next_mmio_cache_entry(struct kvm_vcpu *vcpu) +{ + unsigned int index = vcpu->arch.mmio_cache.index; + + vcpu->arch.mmio_cache.index++; + if (vcpu->arch.mmio_cache.index == MMIO_HPTE_CACHE_SIZE) + vcpu->arch.mmio_cache.index = 0; + + return &vcpu->arch.mmio_cache.entry[index]; +} + /* When called from virtmode, this func should be protected by * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK * can trigger deadlock issue. @@ -842,7 +930,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long avpn; __be64 *hpte; unsigned long mask, val; - unsigned long v, r; + unsigned long v, r, orig_v; /* Get page shift, work out hash and AVPN etc. */ mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; @@ -877,6 +965,8 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, for (i = 0; i < 16; i += 2) { /* Read the PTE racily */ v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hpte[i+1])); /* Check valid/absent, hash, segment size and AVPN */ if (!(v & valid) || (v & mask) != val) @@ -885,8 +975,12 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, /* Lock the PTE and read it under the lock */ while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) cpu_relax(); - v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; + v = orig_v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[i+1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, r); + r = hpte_new_to_old_r(r); + } /* * Check the HPTE again, including base page size @@ -896,7 +990,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); - __unlock_hpte(&hpte[i], v); + __unlock_hpte(&hpte[i], orig_v); } if (val & HPTE_V_SECONDARY) @@ -924,30 +1018,45 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, { struct kvm *kvm = vcpu->kvm; long int index; - unsigned long v, r, gr; + unsigned long v, r, gr, orig_v; __be64 *hpte; unsigned long valid; struct revmap_entry *rev; unsigned long pp, key; + struct mmio_hpte_cache_entry *cache_entry = NULL; + long mmio_update = 0; /* For protection fault, expect to find a valid HPTE */ valid = HPTE_V_VALID; - if (status & DSISR_NOHPTE) + if (status & DSISR_NOHPTE) { valid |= HPTE_V_ABSENT; - - index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); - if (index < 0) { - if (status & DSISR_NOHPTE) - return status; /* there really was no HPTE */ - return 0; /* for prot fault, HPTE disappeared */ + mmio_update = atomic64_read(&kvm->arch.mmio_update); + cache_entry = mmio_cache_search(vcpu, addr, slb_v, mmio_update); } - hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; - r = be64_to_cpu(hpte[1]); - rev = real_vmalloc_addr(&kvm->arch.revmap[index]); - gr = rev->guest_rpte; + if (cache_entry) { + index = cache_entry->pte_index; + v = cache_entry->hpte_v; + r = cache_entry->hpte_r; + gr = cache_entry->rpte; + } else { + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } + hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; + r = be64_to_cpu(hpte[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, r); + r = hpte_new_to_old_r(r); + } + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; - unlock_hpte(hpte, v); + unlock_hpte(hpte, orig_v); + } /* For not found, if the HPTE is valid by now, retry the instruction */ if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) @@ -985,12 +1094,32 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, vcpu->arch.pgfault_index = index; vcpu->arch.pgfault_hpte[0] = v; vcpu->arch.pgfault_hpte[1] = r; + vcpu->arch.pgfault_cache = cache_entry; /* Check the storage key to see if it is possibly emulated MMIO */ - if (data && (vcpu->arch.shregs.msr & MSR_IR) && - (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == - (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) - return -2; /* MMIO emulation - load instr word */ + if ((r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) { + if (!cache_entry) { + unsigned int pshift = 12; + unsigned int pshift_index; + + if (slb_v & SLB_VSID_L) { + pshift_index = ((slb_v & SLB_VSID_LP) >> 4); + pshift = slb_base_page_shift[pshift_index]; + } + cache_entry = next_mmio_cache_entry(vcpu); + cache_entry->eaddr = addr; + cache_entry->slb_base_pshift = pshift; + cache_entry->pte_index = index; + cache_entry->hpte_v = v; + cache_entry->hpte_r = r; + cache_entry->rpte = gr; + cache_entry->slb_v = slb_v; + cache_entry->mmio_update = mmio_update; + } + if (data && (vcpu->arch.shregs.msr & MSR_IR)) + return -2; /* MMIO emulation - load instr word */ + } return -1; /* send fault up to host kernel mode */ } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index a0ea63ac2b52..06edc4366639 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -70,7 +70,11 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - icp_native_cause_ipi_rm(hcpu); + if (paca[hcpu].kvm_hstate.xics_phys) + icp_native_cause_ipi_rm(hcpu); + else + opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu), + IPI_PRIORITY); } #else static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } @@ -737,7 +741,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) unsigned long eoi_rc; -static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) +static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) { unsigned long xics_phys; int64_t rc; @@ -751,7 +755,12 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) /* EOI it */ xics_phys = local_paca->kvm_hstate.xics_phys; - _stwcix(xics_phys + XICS_XIRR, xirr); + if (xics_phys) { + _stwcix(xics_phys + XICS_XIRR, xirr); + } else { + rc = opal_rm_int_eoi(be32_to_cpu(xirr)); + *again = rc > 0; + } } static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu) @@ -809,9 +818,10 @@ static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) } long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, - u32 xirr, + __be32 xirr, struct kvmppc_irq_map *irq_map, - struct kvmppc_passthru_irqmap *pimap) + struct kvmppc_passthru_irqmap *pimap, + bool *again) { struct kvmppc_xics *xics; struct kvmppc_icp *icp; @@ -825,7 +835,8 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ - icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr); + icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, + again); if (check_too_hard(xics, icp) == H_TOO_HARD) return 2; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c3c1d1bcfc67..9338a818e05c 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -501,17 +501,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) cmpwi r0, 0 beq 57f li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4 - mfspr r4, SPRN_LPCR - rlwimi r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) - mtspr SPRN_LPCR, r4 - isync - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b - nap - b . + mfspr r5, SPRN_LPCR + rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) + b kvm_nap_sequence 57: li r0, 0 stbx r0, r3, r4 @@ -523,6 +515,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * * *****************************************************************************/ +/* Stack frame offsets */ +#define STACK_SLOT_TID (112-16) +#define STACK_SLOT_PSSCR (112-24) + .global kvmppc_hv_entry kvmppc_hv_entry: @@ -581,12 +577,14 @@ kvmppc_hv_entry: ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 bne 10f - ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) +BEGIN_FTR_SECTION + ld r6,KVM_SDR1(r9) li r0,LPID_RSVD /* switch to reserved LPID */ mtspr SPRN_LPID,r0 ptesync mtspr SPRN_SDR1,r6 /* switch to partition page table */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_LPID,r7 isync @@ -607,12 +605,8 @@ kvmppc_hv_entry: stdcx. r7,0,r6 bne 23b /* Flush the TLB of any entries for this LPID */ - /* use arch 2.07S as a proxy for POWER8 */ -BEGIN_FTR_SECTION - li r6,512 /* POWER8 has 512 sets */ -FTR_SECTION_ELSE - li r6,128 /* POWER7 has 128 sets */ -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + lwz r6,KVM_TLB_SETS(r9) + li r0,0 /* RS for P9 version of tlbiel */ mtctr r6 li r7,0x800 /* IS field = 0b10 */ ptesync @@ -698,6 +692,14 @@ kvmppc_got_guest: mtspr SPRN_PURR,r7 mtspr SPRN_SPURR,r8 + /* Save host values of some registers */ +BEGIN_FTR_SECTION + mfspr r5, SPRN_TIDR + mfspr r6, SPRN_PSSCR + std r5, STACK_SLOT_TID(r1) + std r6, STACK_SLOT_PSSCR(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + BEGIN_FTR_SECTION /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -750,14 +752,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) BEGIN_FTR_SECTION ld r5, VCPU_MMCR + 24(r4) ld r6, VCPU_SIER(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 +BEGIN_FTR_SECTION_NESTED(96) lwz r7, VCPU_PMC + 24(r4) lwz r8, VCPU_PMC + 28(r4) ld r9, VCPU_MMCR + 32(r4) - mtspr SPRN_MMCR2, r5 - mtspr SPRN_SIER, r6 mtspr SPRN_SPMC1, r7 mtspr SPRN_SPMC2, r8 mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_MMCR0, r3 isync @@ -813,20 +817,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_EBBHR, r8 ld r5, VCPU_EBBRR(r4) ld r6, VCPU_BESCR(r4) - ld r7, VCPU_CSIGR(r4) - ld r8, VCPU_TACR(r4) + lwz r7, VCPU_GUEST_PID(r4) + ld r8, VCPU_WORT(r4) mtspr SPRN_EBBRR, r5 mtspr SPRN_BESCR, r6 - mtspr SPRN_CSIGR, r7 - mtspr SPRN_TACR, r8 + mtspr SPRN_PID, r7 + mtspr SPRN_WORT, r8 +BEGIN_FTR_SECTION + /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) ld r6, VCPU_ACOP(r4) - lwz r7, VCPU_GUEST_PID(r4) - ld r8, VCPU_WORT(r4) + ld r7, VCPU_CSIGR(r4) + ld r8, VCPU_TACR(r4) mtspr SPRN_TCSCR, r5 mtspr SPRN_ACOP, r6 - mtspr SPRN_PID, r7 - mtspr SPRN_WORT, r8 + mtspr SPRN_CSIGR, r7 + mtspr SPRN_TACR, r8 +FTR_SECTION_ELSE + /* POWER9-only registers */ + ld r5, VCPU_TID(r4) + ld r6, VCPU_PSSCR(r4) + oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */ + mtspr SPRN_TIDR, r5 + mtspr SPRN_PSSCR, r6 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 8: /* @@ -1341,20 +1355,29 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) std r8, VCPU_EBBHR(r9) mfspr r5, SPRN_EBBRR mfspr r6, SPRN_BESCR - mfspr r7, SPRN_CSIGR - mfspr r8, SPRN_TACR + mfspr r7, SPRN_PID + mfspr r8, SPRN_WORT std r5, VCPU_EBBRR(r9) std r6, VCPU_BESCR(r9) - std r7, VCPU_CSIGR(r9) - std r8, VCPU_TACR(r9) + stw r7, VCPU_GUEST_PID(r9) + std r8, VCPU_WORT(r9) +BEGIN_FTR_SECTION mfspr r5, SPRN_TCSCR mfspr r6, SPRN_ACOP - mfspr r7, SPRN_PID - mfspr r8, SPRN_WORT + mfspr r7, SPRN_CSIGR + mfspr r8, SPRN_TACR std r5, VCPU_TCSCR(r9) std r6, VCPU_ACOP(r9) - stw r7, VCPU_GUEST_PID(r9) - std r8, VCPU_WORT(r9) + std r7, VCPU_CSIGR(r9) + std r8, VCPU_TACR(r9) +FTR_SECTION_ELSE + mfspr r5, SPRN_TIDR + mfspr r6, SPRN_PSSCR + std r5, VCPU_TID(r9) + rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */ + rotldi r6, r6, 60 + std r6, VCPU_PSSCR(r9) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) /* * Restore various registers to 0, where non-zero values * set by the guest could disrupt the host. @@ -1363,12 +1386,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_IAMR, r0 mtspr SPRN_CIABR, r0 mtspr SPRN_DAWRX, r0 - mtspr SPRN_TCSCR, r0 mtspr SPRN_WORT, r0 +BEGIN_FTR_SECTION + mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 sldi r0, r0, 31 mtspr SPRN_MMCRS, r0 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) 8: /* Save and reset AMR and UAMOR before turning on the MMU */ @@ -1502,15 +1527,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) stw r8, VCPU_PMC + 20(r9) BEGIN_FTR_SECTION mfspr r5, SPRN_SIER + std r5, VCPU_SIER(r9) +BEGIN_FTR_SECTION_NESTED(96) mfspr r6, SPRN_SPMC1 mfspr r7, SPRN_SPMC2 mfspr r8, SPRN_MMCRS - std r5, VCPU_SIER(r9) stw r6, VCPU_PMC + 24(r9) stw r7, VCPU_PMC + 28(r9) std r8, VCPU_MMCR + 32(r9) lis r4, 0x8000 mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 22: /* Clear out SLB */ @@ -1519,6 +1546,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbia ptesync + /* Restore host values of some registers */ +BEGIN_FTR_SECTION + ld r5, STACK_SLOT_TID(r1) + ld r6, STACK_SLOT_PSSCR(r1) + mtspr SPRN_TIDR, r5 + mtspr SPRN_PSSCR, r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do @@ -1552,12 +1587,14 @@ kvmhv_switch_to_host: beq 19f /* Primary thread switches back to host partition */ - ld r6,KVM_HOST_SDR1(r4) lwz r7,KVM_HOST_LPID(r4) +BEGIN_FTR_SECTION + ld r6,KVM_HOST_SDR1(r4) li r8,LPID_RSVD /* switch to reserved LPID */ mtspr SPRN_LPID,r8 ptesync - mtspr SPRN_SDR1,r6 /* switch to partition page table */ + mtspr SPRN_SDR1,r6 /* switch to host page table */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_LPID,r7 isync @@ -2211,6 +2248,21 @@ BEGIN_FTR_SECTION ori r5, r5, LPCR_PECEDH rlwimi r5, r3, 0, LPCR_PECEDP END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +kvm_nap_sequence: /* desired LPCR value in r5 */ +BEGIN_FTR_SECTION + /* + * PSSCR bits: exit criterion = 1 (wakeup based on LPCR at sreset) + * enable state loss = 1 (allow SMT mode switch) + * requested level = 0 (just stop dispatching) + */ + lis r3, (PSSCR_EC | PSSCR_ESL)@h + mtspr SPRN_PSSCR, r3 + /* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */ + li r4, LPCR_PECE_HVEE@higher + sldi r4, r4, 32 + or r5, r5, r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtspr SPRN_LPCR,r5 isync li r0, 0 @@ -2219,7 +2271,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ld r0, HSTATE_SCRATCH0(r13) 1: cmpd r0, r0 bne 1b +BEGIN_FTR_SECTION nap +FTR_SECTION_ELSE + PPC_STOP +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) b . 33: mr r4, r3 @@ -2600,11 +2656,13 @@ kvmppc_save_tm: mfctr r7 mfspr r8, SPRN_AMR mfspr r10, SPRN_TAR + mfxer r11 std r5, VCPU_LR_TM(r9) stw r6, VCPU_CR_TM(r9) std r7, VCPU_CTR_TM(r9) std r8, VCPU_AMR_TM(r9) std r10, VCPU_TAR_TM(r9) + std r11, VCPU_XER_TM(r9) /* Restore r12 as trap number. */ lwz r12, VCPU_TRAP(r9) @@ -2697,11 +2755,13 @@ kvmppc_restore_tm: ld r7, VCPU_CTR_TM(r4) ld r8, VCPU_AMR_TM(r4) ld r9, VCPU_TAR_TM(r4) + ld r10, VCPU_XER_TM(r4) mtlr r5 mtcr r6 mtctr r7 mtspr SPRN_AMR, r8 mtspr SPRN_TAR, r9 + mtxer r10 /* * Load up PPR and DSCR values but don't put them in the actual SPRs diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 70963c845e96..efd1183a6b16 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -536,7 +536,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: - case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: @@ -545,13 +544,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif r = 1; break; + + case KVM_CAP_PPC_ALLOC_HTAB: + r = hv_enabled; + break; #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: - if (hv_enabled) - r = threads_per_subcore; - else - r = 0; + r = 0; + if (hv_enabled) { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + r = 1; + else + r = threads_per_subcore; + } break; case KVM_CAP_PPC_RMA: r = 0; diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index fb21990c0fb4..ebc6dd449556 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -449,7 +449,7 @@ TRACE_EVENT(kvmppc_vcore_wakeup, __entry->tgid = current->tgid; ), - TP_printk("%s time %lld ns, tgid=%d", + TP_printk("%s time %llu ns, tgid=%d", __entry->waited ? "wait" : "poll", __entry->ns, __entry->tgid) ); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 83ddc0e171b0..ad9fd5245be2 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -221,13 +221,18 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, return -1; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", i, hpte_v, hpte_r); } + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); + hpte_v = hpte_old_to_new_v(hpte_v); + } + hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -295,6 +300,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, vpn, want_v & HPTE_V_AVPN, slot, newpp); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -309,6 +316,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, native_lock_hpte(hptep); /* recheck with locks held */ hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))) { ret = -1; @@ -350,6 +359,8 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + slot; hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) /* HPTE matches */ @@ -409,6 +420,8 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, want_v = hpte_encode_avpn(vpn, bpsize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); /* * We need to invalidate the TLB always because hpte_remove doesn't do @@ -467,6 +480,8 @@ static void native_hugepage_invalidate(unsigned long vsid, want_v = hpte_encode_avpn(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); /* Even if we miss, we need to invalidate the TLB */ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) @@ -504,6 +519,10 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, /* Look at the 8 bit LP value */ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); + hpte_r = hpte_new_to_old_r(hpte_r); + } if (!(hpte_v & HPTE_V_LARGE)) { size = MMU_PAGE_4K; a_size = MMU_PAGE_4K; @@ -512,11 +531,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, a_size = hpte_page_sizes[lp] >> 4; } /* This works for all page sizes, and for 256M and 1T segments */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - *ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT; - else - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; - + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; shift = mmu_psize_defs[size].shift; avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); @@ -639,6 +654,9 @@ static void native_flush_hash_range(unsigned long number, int local) want_v = hpte_encode_avpn(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, + be64_to_cpu(hptep->r)); if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 78dabf065ba9..8410b4bb36ed 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -796,37 +796,17 @@ static void update_hid_for_hash(void) static void __init hash_init_partition_table(phys_addr_t hash_table, unsigned long htab_size) { - unsigned long ps_field; - unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + mmu_partition_table_init(); /* - * slb llp encoding for the page size used in VPM real mode. - * We can ignore that for lpid 0 + * PS field (VRMA page size) is not used for LPID 0, hence set to 0. + * For now, UPRT is 0 and we have no segment table. */ - ps_field = 0; htab_size = __ilog2(htab_size) - 18; - - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); - partition_tb = __va(memblock_alloc_base(patb_size, patb_size, - MEMBLOCK_ALLOC_ANYWHERE)); - - /* Initialize the Partition Table with no entries */ - memset((void *)partition_tb, 0, patb_size); - partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size); - /* - * FIXME!! This should be done via update_partition table - * For now UPRT is 0 for us. - */ - partition_tb->patb1 = 0; + mmu_partition_table_set_entry(0, hash_table | htab_size, 0); pr_info("Partition table %p\n", partition_tb); if (cpu_has_feature(CPU_FTR_POWER9_DD1)) update_hid_for_hash(); - /* - * update partition table control register, - * 64 K size. - */ - mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - } static void __init htab_initialize(void) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 688b54517655..8d941c692eb3 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -177,23 +177,15 @@ redo: static void __init radix_init_partition_table(void) { - unsigned long rts_field; + unsigned long rts_field, dw0; + mmu_partition_table_init(); rts_field = radix__get_tree_size(); + dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; + mmu_partition_table_set_entry(0, dw0, 0); - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); - partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); - partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | - RADIX_PGD_INDEX_SIZE | PATB_HR); pr_info("Initializing Radix MMU\n"); pr_info("Partition table %p\n", partition_tb); - - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - /* - * update partition table control register, - * 64 K size. - */ - mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); } void __init radix_init_native(void) @@ -378,6 +370,8 @@ void __init radix__early_init_mmu(void) radix_init_partition_table(); } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + radix_init_pgtable(); } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index f5e8d4edb808..8bca7f58afc4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -431,3 +431,37 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) } } #endif + +#ifdef CONFIG_PPC_BOOK3S_64 +void __init mmu_partition_table_init(void) +{ + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1) +{ + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* Global flush of TLBs and partition table caches for this lpid */ + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} +EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 44d2d842cee7..3aa40f1b20f5 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -304,8 +304,11 @@ OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); +OPAL_CALL_REAL(opal_rm_int_get_xirr, OPAL_INT_GET_XIRR); OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL_REAL(opal_rm_int_eoi, OPAL_INT_EOI); OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL_REAL(opal_rm_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); OPAL_CALL_REAL(opal_rm_pci_tce_kill, OPAL_PCI_TCE_KILL); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 6c9a65b52e63..b3b8930ac52f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -896,3 +896,5 @@ EXPORT_SYMBOL_GPL(opal_leds_get_ind); EXPORT_SYMBOL_GPL(opal_leds_set_ind); /* Export this symbol for PowerNV Operator Panel class driver */ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); +/* Export this for KVM */ +EXPORT_SYMBOL_GPL(opal_int_set_mfrr); diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index cb3c50328de8..cc2b281a3766 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn, vflags &= ~HPTE_V_SECONDARY; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags; + hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags; spin_lock_irqsave(&ps3_htab_lock, flags); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index aa35245d8d6d..f2c98f6c1c9c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -145,7 +145,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, hpte_group, vpn, pa, rflags, vflags, psize); hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 028f97be5bae..c6722112527d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -136,6 +136,7 @@ config S390 select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG + select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -169,6 +170,7 @@ config S390 select OLD_SIGSUSPEND3 select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select TTY select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile index 0daa070d6c9d..6bd2c9022be3 100644 --- a/arch/s390/boot/compressed/Makefile +++ b/arch/s390/boot/compressed/Makefile @@ -10,7 +10,7 @@ targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 targets += misc.o piggy.o sizes.h head.o -KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS := -m64 -D__KERNEL__ -O2 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -msoft-float KBUILD_CFLAGS += $(call cc-option,-mpacked-stack) diff --git a/arch/s390/boot/compressed/head.S b/arch/s390/boot/compressed/head.S index 28c4f96a2d9c..11f6254c561e 100644 --- a/arch/s390/boot/compressed/head.S +++ b/arch/s390/boot/compressed/head.S @@ -46,7 +46,7 @@ mover_end: .align 8 .Lstack: - .quad 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER)) + .quad 0x8000 + (1<<(PAGE_SHIFT+THREAD_SIZE_ORDER)) .Loffset: .quad 0x11000 .Lmvsize: diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 45968686f918..e659daffe368 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -66,6 +66,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA=y +CONFIG_CMA_DEBUG=y +CONFIG_CMA_DEBUGFS=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZPOOL=m CONFIG_ZBUD=m @@ -366,6 +368,8 @@ CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m @@ -438,7 +442,6 @@ CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m -CONFIG_VHOST_NET=m # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_INTEL is not set @@ -693,3 +696,4 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 1dd05e345c4d..95ceac50bc65 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -362,6 +362,8 @@ CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m @@ -434,7 +436,6 @@ CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m -CONFIG_VHOST_NET=m # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_INTEL is not set @@ -633,3 +634,4 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 29d1178666f0..bc7b176f5795 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -362,6 +362,8 @@ CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m @@ -434,7 +436,6 @@ CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m -CONFIG_VHOST_NET=m # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_INTEL is not set @@ -632,3 +633,4 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 9cc050f9536c..1113389d0a39 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -507,8 +507,10 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, prng_data->prngws.byte_counter += n; prng_data->prngws.reseed_counter += n; - if (copy_to_user(ubuf, prng_data->buf, chunk)) - return -EFAULT; + if (copy_to_user(ubuf, prng_data->buf, chunk)) { + ret = -EFAULT; + break; + } nbytes -= chunk; ret += chunk; diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 09bccb224d03..cf8a2d92467f 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -3,6 +3,7 @@ * * Copyright IBM Corp. 2006, 2008 * Author(s): Michael Holzheu <holzheu@de.ibm.com> + * License: GPL */ #define KMSG_COMPONENT "hypfs" @@ -18,7 +19,8 @@ #include <linux/time.h> #include <linux/parser.h> #include <linux/sysfs.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/kobject.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/uio.h> @@ -443,7 +445,6 @@ static struct file_system_type hypfs_type = { .mount = hypfs_mount, .kill_sb = hypfs_kill_super }; -MODULE_ALIAS_FS("s390_hypfs"); static const struct super_operations hypfs_s_ops = { .statfs = simple_statfs, @@ -497,21 +498,4 @@ fail_dbfs_exit: pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } - -static void __exit hypfs_exit(void) -{ - unregister_filesystem(&hypfs_type); - sysfs_remove_mount_point(hypervisor_kobj, "s390"); - hypfs_diag0c_exit(); - hypfs_sprp_exit(); - hypfs_vm_exit(); - hypfs_diag_exit(); - hypfs_dbfs_exit(); -} - -module_init(hypfs_init) -module_exit(hypfs_exit) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Michael Holzheu <holzheu@de.ibm.com>"); -MODULE_DESCRIPTION("s390 Hypervisor Filesystem"); +device_initcall(hypfs_init) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 20f196b82a6e..8aea32fe8bd2 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -1,6 +1,6 @@ - - +generic-y += asm-offsets.h generic-y += clkdev.h +generic-y += dma-contiguous.h generic-y += export.h generic-y += irq_work.h generic-y += mcs_spinlock.h diff --git a/arch/s390/include/asm/asm-offsets.h b/arch/s390/include/asm/asm-offsets.h deleted file mode 100644 index d370ee36a182..000000000000 --- a/arch/s390/include/asm/asm-offsets.h +++ /dev/null @@ -1 +0,0 @@ -#include <generated/asm-offsets.h> diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index d28cc2f5b7b2..f7f69dfd2db2 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -1,13 +1,8 @@ /* - * Copyright IBM Corp. 1999, 2009 + * Copyright IBM Corp. 1999, 2016 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, * Denis Joseph Barrow, - * Arnd Bergmann <arndb@de.ibm.com>, - * - * Atomic operations that C can't guarantee us. - * Useful for resource counting etc. - * s390 uses 'Compare And Swap' for atomicity in SMP environment. - * + * Arnd Bergmann, */ #ifndef __ARCH_S390_ATOMIC__ @@ -15,62 +10,12 @@ #include <linux/compiler.h> #include <linux/types.h> +#include <asm/atomic_ops.h> #include <asm/barrier.h> #include <asm/cmpxchg.h> #define ATOMIC_INIT(i) { (i) } -#define __ATOMIC_NO_BARRIER "\n" - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __ATOMIC_OR "lao" -#define __ATOMIC_AND "lan" -#define __ATOMIC_ADD "laa" -#define __ATOMIC_XOR "lax" -#define __ATOMIC_BARRIER "bcr 14,0\n" - -#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - int old_val; \ - \ - typecheck(atomic_t *, ptr); \ - asm volatile( \ - op_string " %0,%2,%1\n" \ - __barrier \ - : "=d" (old_val), "+Q" ((ptr)->counter) \ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define __ATOMIC_OR "or" -#define __ATOMIC_AND "nr" -#define __ATOMIC_ADD "ar" -#define __ATOMIC_XOR "xr" -#define __ATOMIC_BARRIER "\n" - -#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - int old_val, new_val; \ - \ - typecheck(atomic_t *, ptr); \ - asm volatile( \ - " l %0,%2\n" \ - "0: lr %1,%0\n" \ - op_string " %1,%3\n" \ - " cs %0,%1,%2\n" \ - " jl 0b" \ - : "=&d" (old_val), "=&d" (new_val), "+Q" ((ptr)->counter)\ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - static inline int atomic_read(const atomic_t *v) { int c; @@ -90,27 +35,23 @@ static inline void atomic_set(atomic_t *v, int i) static inline int atomic_add_return(int i, atomic_t *v) { - return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER) + i; + return __atomic_add_barrier(i, &v->counter) + i; } static inline int atomic_fetch_add(int i, atomic_t *v) { - return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER); + return __atomic_add_barrier(i, &v->counter); } static inline void atomic_add(int i, atomic_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - asm volatile( - "asi %0,%1\n" - : "+Q" (v->counter) - : "i" (i) - : "cc", "memory"); + __atomic_add_const(i, &v->counter); return; } #endif - __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_NO_BARRIER); + __atomic_add(i, &v->counter); } #define atomic_add_negative(_i, _v) (atomic_add_return(_i, _v) < 0) @@ -125,19 +66,19 @@ static inline void atomic_add(int i, atomic_t *v) #define atomic_dec_return(_v) atomic_sub_return(1, _v) #define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0) -#define ATOMIC_OPS(op, OP) \ +#define ATOMIC_OPS(op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ - __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_NO_BARRIER); \ + __atomic_##op(i, &v->counter); \ } \ static inline int atomic_fetch_##op(int i, atomic_t *v) \ { \ - return __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_BARRIER); \ + return __atomic_##op##_barrier(i, &v->counter); \ } -ATOMIC_OPS(and, AND) -ATOMIC_OPS(or, OR) -ATOMIC_OPS(xor, XOR) +ATOMIC_OPS(and) +ATOMIC_OPS(or) +ATOMIC_OPS(xor) #undef ATOMIC_OPS @@ -145,12 +86,7 @@ ATOMIC_OPS(xor, XOR) static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { - asm volatile( - " cs %0,%2,%1" - : "+d" (old), "+Q" (v->counter) - : "d" (new) - : "cc", "memory"); - return old; + return __atomic_cmpxchg(&v->counter, old, new); } static inline int __atomic_add_unless(atomic_t *v, int a, int u) @@ -168,65 +104,11 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return c; } - -#undef __ATOMIC_LOOP - #define ATOMIC64_INIT(i) { (i) } -#define __ATOMIC64_NO_BARRIER "\n" - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __ATOMIC64_OR "laog" -#define __ATOMIC64_AND "lang" -#define __ATOMIC64_ADD "laag" -#define __ATOMIC64_XOR "laxg" -#define __ATOMIC64_BARRIER "bcr 14,0\n" - -#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - long long old_val; \ - \ - typecheck(atomic64_t *, ptr); \ - asm volatile( \ - op_string " %0,%2,%1\n" \ - __barrier \ - : "=d" (old_val), "+Q" ((ptr)->counter) \ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define __ATOMIC64_OR "ogr" -#define __ATOMIC64_AND "ngr" -#define __ATOMIC64_ADD "agr" -#define __ATOMIC64_XOR "xgr" -#define __ATOMIC64_BARRIER "\n" - -#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - long long old_val, new_val; \ - \ - typecheck(atomic64_t *, ptr); \ - asm volatile( \ - " lg %0,%2\n" \ - "0: lgr %1,%0\n" \ - op_string " %1,%3\n" \ - " csg %0,%1,%2\n" \ - " jl 0b" \ - : "=&d" (old_val), "=&d" (new_val), "+Q" ((ptr)->counter)\ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -static inline long long atomic64_read(const atomic64_t *v) +static inline long atomic64_read(const atomic64_t *v) { - long long c; + long c; asm volatile( " lg %0,%1\n" @@ -234,71 +116,60 @@ static inline long long atomic64_read(const atomic64_t *v) return c; } -static inline void atomic64_set(atomic64_t *v, long long i) +static inline void atomic64_set(atomic64_t *v, long i) { asm volatile( " stg %1,%0\n" : "=Q" (v->counter) : "d" (i)); } -static inline long long atomic64_add_return(long long i, atomic64_t *v) +static inline long atomic64_add_return(long i, atomic64_t *v) { - return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER) + i; + return __atomic64_add_barrier(i, &v->counter) + i; } -static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +static inline long atomic64_fetch_add(long i, atomic64_t *v) { - return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER); + return __atomic64_add_barrier(i, &v->counter); } -static inline void atomic64_add(long long i, atomic64_t *v) +static inline void atomic64_add(long i, atomic64_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - asm volatile( - "agsi %0,%1\n" - : "+Q" (v->counter) - : "i" (i) - : "cc", "memory"); + __atomic64_add_const(i, &v->counter); return; } #endif - __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_NO_BARRIER); + __atomic64_add(i, &v->counter); } #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -static inline long long atomic64_cmpxchg(atomic64_t *v, - long long old, long long new) +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) { - asm volatile( - " csg %0,%2,%1" - : "+d" (old), "+Q" (v->counter) - : "d" (new) - : "cc", "memory"); - return old; + return __atomic64_cmpxchg(&v->counter, old, new); } -#define ATOMIC64_OPS(op, OP) \ +#define ATOMIC64_OPS(op) \ static inline void atomic64_##op(long i, atomic64_t *v) \ { \ - __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_NO_BARRIER); \ + __atomic64_##op(i, &v->counter); \ } \ static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ { \ - return __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_BARRIER); \ + return __atomic64_##op##_barrier(i, &v->counter); \ } -ATOMIC64_OPS(and, AND) -ATOMIC64_OPS(or, OR) -ATOMIC64_OPS(xor, XOR) +ATOMIC64_OPS(and) +ATOMIC64_OPS(or) +ATOMIC64_OPS(xor) #undef ATOMIC64_OPS -#undef __ATOMIC64_LOOP -static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u) +static inline int atomic64_add_unless(atomic64_t *v, long i, long u) { - long long c, old; + long c, old; c = atomic64_read(v); for (;;) { @@ -312,9 +183,9 @@ static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u) return c != u; } -static inline long long atomic64_dec_if_positive(atomic64_t *v) +static inline long atomic64_dec_if_positive(atomic64_t *v) { - long long c, old, dec; + long c, old, dec; c = atomic64_read(v); for (;;) { @@ -333,9 +204,9 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #define atomic64_inc(_v) atomic64_add(1, _v) #define atomic64_inc_return(_v) atomic64_add_return(1, _v) #define atomic64_inc_and_test(_v) (atomic64_add_return(1, _v) == 0) -#define atomic64_sub_return(_i, _v) atomic64_add_return(-(long long)(_i), _v) -#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long long)(_i), _v) -#define atomic64_sub(_i, _v) atomic64_add(-(long long)(_i), _v) +#define atomic64_sub_return(_i, _v) atomic64_add_return(-(long)(_i), _v) +#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long)(_i), _v) +#define atomic64_sub(_i, _v) atomic64_add(-(long)(_i), _v) #define atomic64_sub_and_test(_i, _v) (atomic64_sub_return(_i, _v) == 0) #define atomic64_dec(_v) atomic64_sub(1, _v) #define atomic64_dec_return(_v) atomic64_sub_return(1, _v) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h new file mode 100644 index 000000000000..ac9e2b939d04 --- /dev/null +++ b/arch/s390/include/asm/atomic_ops.h @@ -0,0 +1,130 @@ +/* + * Low level function for atomic operations + * + * Copyright IBM Corp. 1999, 2016 + */ + +#ifndef __ARCH_S390_ATOMIC_OPS__ +#define __ARCH_S390_ATOMIC_OPS__ + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ +static inline op_type op_name(op_type val, op_type *ptr) \ +{ \ + op_type old; \ + \ + asm volatile( \ + op_string " %[old],%[val],%[ptr]\n" \ + op_barrier \ + : [old] "=d" (old), [ptr] "+Q" (*ptr) \ + : [val] "d" (val) : "cc", "memory"); \ + return old; \ +} \ + +#define __ATOMIC_OPS(op_name, op_type, op_string) \ + __ATOMIC_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + +__ATOMIC_OPS(__atomic_add, int, "laa") +__ATOMIC_OPS(__atomic_and, int, "lan") +__ATOMIC_OPS(__atomic_or, int, "lao") +__ATOMIC_OPS(__atomic_xor, int, "lax") + +__ATOMIC_OPS(__atomic64_add, long, "laag") +__ATOMIC_OPS(__atomic64_and, long, "lang") +__ATOMIC_OPS(__atomic64_or, long, "laog") +__ATOMIC_OPS(__atomic64_xor, long, "laxg") + +#undef __ATOMIC_OPS +#undef __ATOMIC_OP + +static inline void __atomic_add_const(int val, int *ptr) +{ + asm volatile( + " asi %[ptr],%[val]\n" + : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc"); +} + +static inline void __atomic64_add_const(long val, long *ptr) +{ + asm volatile( + " agsi %[ptr],%[val]\n" + : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc"); +} + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define __ATOMIC_OP(op_name, op_string) \ +static inline int op_name(int val, int *ptr) \ +{ \ + int old, new; \ + \ + asm volatile( \ + "0: lr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " cs %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC_OPS(op_name, op_string) \ + __ATOMIC_OP(op_name, op_string) \ + __ATOMIC_OP(op_name##_barrier, op_string) + +__ATOMIC_OPS(__atomic_add, "ar") +__ATOMIC_OPS(__atomic_and, "nr") +__ATOMIC_OPS(__atomic_or, "or") +__ATOMIC_OPS(__atomic_xor, "xr") + +#undef __ATOMIC_OPS + +#define __ATOMIC64_OP(op_name, op_string) \ +static inline long op_name(long val, long *ptr) \ +{ \ + long old, new; \ + \ + asm volatile( \ + "0: lgr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " csg %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC64_OPS(op_name, op_string) \ + __ATOMIC64_OP(op_name, op_string) \ + __ATOMIC64_OP(op_name##_barrier, op_string) + +__ATOMIC64_OPS(__atomic64_add, "agr") +__ATOMIC64_OPS(__atomic64_and, "ngr") +__ATOMIC64_OPS(__atomic64_or, "ogr") +__ATOMIC64_OPS(__atomic64_xor, "xgr") + +#undef __ATOMIC64_OPS + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +static inline int __atomic_cmpxchg(int *ptr, int old, int new) +{ + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) : "cc", "memory"); + return old; +} + +static inline long __atomic64_cmpxchg(long *ptr, long old, long new) +{ + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) : "cc", "memory"); + return old; +} + +#endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 8043f10da6b5..d92047da5ccb 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -42,57 +42,9 @@ #include <linux/typecheck.h> #include <linux/compiler.h> +#include <asm/atomic_ops.h> #include <asm/barrier.h> -#define __BITOPS_NO_BARRIER "\n" - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __BITOPS_OR "laog" -#define __BITOPS_AND "lang" -#define __BITOPS_XOR "laxg" -#define __BITOPS_BARRIER "bcr 14,0\n" - -#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ -({ \ - unsigned long __old; \ - \ - typecheck(unsigned long *, (__addr)); \ - asm volatile( \ - __op_string " %0,%2,%1\n" \ - __barrier \ - : "=d" (__old), "+Q" (*(__addr)) \ - : "d" (__val) \ - : "cc", "memory"); \ - __old; \ -}) - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define __BITOPS_OR "ogr" -#define __BITOPS_AND "ngr" -#define __BITOPS_XOR "xgr" -#define __BITOPS_BARRIER "\n" - -#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ -({ \ - unsigned long __old, __new; \ - \ - typecheck(unsigned long *, (__addr)); \ - asm volatile( \ - " lg %0,%2\n" \ - "0: lgr %1,%0\n" \ - __op_string " %1,%3\n" \ - " csg %0,%1,%2\n" \ - " jl 0b" \ - : "=&d" (__old), "=&d" (__new), "+Q" (*(__addr))\ - : "d" (__val) \ - : "cc", "memory"); \ - __old; \ -}) - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) static inline unsigned long * @@ -128,7 +80,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_NO_BARRIER); + __atomic64_or(mask, addr); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) @@ -149,7 +101,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_NO_BARRIER); + __atomic64_and(mask, addr); } static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) @@ -170,7 +122,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_NO_BARRIER); + __atomic64_xor(mask, addr); } static inline int @@ -180,7 +132,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_BARRIER); + old = __atomic64_or_barrier(mask, addr); return (old & mask) != 0; } @@ -191,7 +143,7 @@ test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_BARRIER); + old = __atomic64_and_barrier(mask, addr); return (old & ~mask) != 0; } @@ -202,7 +154,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_BARRIER); + old = __atomic64_xor_barrier(mask, addr); return (old & mask) != 0; } diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 03516476127b..b69d8bc231a5 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -104,7 +104,8 @@ struct hws_basic_entry { unsigned int P:1; /* 28 PSW Problem state */ unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ - unsigned int:16; + unsigned int CL:2; /* 32-33 Configuration Level */ + unsigned int:14; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 1736c7d3c94c..f4381e1fb19e 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -193,7 +193,7 @@ extern char elf_platform[]; do { \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ - current_thread_info()->sys_call_table = \ + current->thread.sys_call_table = \ (unsigned long) &sys_call_table; \ } while (0) #else /* CONFIG_COMPAT */ @@ -204,11 +204,11 @@ do { \ (current->personality & ~PER_MASK)); \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ set_thread_flag(TIF_31BIT); \ - current_thread_info()->sys_call_table = \ + current->thread.sys_call_table = \ (unsigned long) &sys_call_table_emu; \ } else { \ clear_thread_flag(TIF_31BIT); \ - current_thread_info()->sys_call_table = \ + current->thread.sys_call_table = \ (unsigned long) &sys_call_table; \ } \ } while (0) diff --git a/arch/s390/include/asm/facilities_src.h b/arch/s390/include/asm/facilities_src.h deleted file mode 100644 index 3b758f66e48b..000000000000 --- a/arch/s390/include/asm/facilities_src.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright IBM Corp. 2015 - */ - -#ifndef S390_GEN_FACILITIES_C -#error "This file can only be included by gen_facilities.c" -#endif - -#include <linux/kconfig.h> - -struct facility_def { - char *name; - int *bits; -}; - -static struct facility_def facility_defs[] = { - { - /* - * FACILITIES_ALS contains the list of facilities that are - * required to run a kernel that is compiled e.g. with - * -march=<machine>. - */ - .name = "FACILITIES_ALS", - .bits = (int[]){ -#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES - 0, /* N3 instructions */ - 1, /* z/Arch mode installed */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES - 18, /* long displacement facility */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - 7, /* stfle */ - 17, /* message security assist */ - 21, /* extended-immediate facility */ - 25, /* store clock fast */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - 27, /* mvcos */ - 32, /* compare and swap and store */ - 33, /* compare and swap and store 2 */ - 34, /* general extension facility */ - 35, /* execute extensions */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - 45, /* fast-BCR, etc. */ -#endif -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - 49, /* misc-instruction-extensions */ - 52, /* interlocked facility 2 */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES - 53, /* load-and-zero-rightmost-byte, etc. */ -#endif - -1 /* END */ - } - }, - { - .name = "FACILITIES_KVM", - .bits = (int[]){ - 0, /* N3 instructions */ - 1, /* z/Arch mode installed */ - 2, /* z/Arch mode active */ - 3, /* DAT-enhancement */ - 4, /* idte segment table */ - 5, /* idte region table */ - 6, /* ASN-and-LX reuse */ - 7, /* stfle */ - 8, /* enhanced-DAT 1 */ - 9, /* sense-running-status */ - 10, /* conditional sske */ - 13, /* ipte-range */ - 14, /* nonquiescing key-setting */ - 73, /* transactional execution */ - 75, /* access-exception-fetch/store indication */ - 76, /* msa extension 3 */ - 77, /* msa extension 4 */ - 78, /* enhanced-DAT 2 */ - -1 /* END */ - } - }, -}; diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 4da22b2f0521..edb5161df7e2 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -97,7 +97,7 @@ void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs); extern void do_reipl(void); extern void do_halt(void); extern void do_poff(void); -extern void ipl_save_parameters(void); +extern void ipl_verify_parameters(void); extern void ipl_update_parameters(void); extern size_t append_ipl_vmparm(char *, size_t); extern size_t append_ipl_scpdata(char *, size_t); diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 7b93b78f423c..9bfad2ad6312 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -95,7 +95,7 @@ struct lowcore { /* Current process. */ __u64 current_task; /* 0x0310 */ - __u64 thread_info; /* 0x0318 */ + __u8 pad_0x318[0x320-0x318]; /* 0x0318 */ __u64 kernel_stack; /* 0x0320 */ /* Interrupt, panic and restart stack. */ @@ -126,7 +126,8 @@ struct lowcore { __u64 percpu_offset; /* 0x0378 */ __u64 vdso_per_cpu_data; /* 0x0380 */ __u64 machine_flags; /* 0x0388 */ - __u8 pad_0x0390[0x0398-0x0390]; /* 0x0390 */ + __u32 preempt_count; /* 0x0390 */ + __u8 pad_0x0394[0x0398-0x0394]; /* 0x0394 */ __u64 gmap; /* 0x0398 */ __u32 spinlock_lockval; /* 0x03a0 */ __u32 fpu_flags; /* 0x03a4 */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index e75c64cbcf08..c232ef9711f5 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -46,6 +46,8 @@ struct clp_fh_list_entry { #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 +extern bool zpci_unique_uid; + /* List PCI functions request */ struct clp_req_list_pci { struct clp_req_hdr hdr; @@ -59,7 +61,8 @@ struct clp_rsp_list_pci { u64 resume_token; u32 reserved2; u16 max_fn; - u8 reserved3; + u8 : 7; + u8 uid_checking : 1; u8 entry_size; struct clp_fh_list_entry fh_list[CLP_FH_LIST_NR_ENTRIES]; } __packed; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index f4eb9843eed4..166f703dad7c 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -27,17 +27,17 @@ extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) { - typedef struct { char _[n]; } addrtype; - - *s = val; - n = (n / 256) - 1; - asm volatile( - " mvc 8(248,%0),0(%0)\n" - "0: mvc 256(256,%0),0(%0)\n" - " la %0,256(%0)\n" - " brct %1,0b\n" - : "+a" (s), "+d" (n), "=m" (*(addrtype *) s) - : "m" (*(addrtype *) s)); + struct addrtype { char _[256]; }; + int i; + + for (i = 0; i < n; i += 256) { + *s = val; + asm volatile( + "mvc 8(248,%[s]),0(%[s])\n" + : "+m" (*(struct addrtype *) s) + : [s] "a" (s)); + s += 256 / sizeof(long); + } } static inline void crst_table_init(unsigned long *crst, unsigned long entry) diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h new file mode 100644 index 000000000000..b0776b2c8dcf --- /dev/null +++ b/arch/s390/include/asm/preempt.h @@ -0,0 +1,137 @@ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include <asm/current.h> +#include <linux/thread_info.h> +#include <asm/atomic_ops.h> + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) + +static inline int preempt_count(void) +{ + return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED; +} + +static inline void preempt_count_set(int pc) +{ + int old, new; + + do { + old = READ_ONCE(S390_lowcore.preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (__atomic_cmpxchg(&S390_lowcore.preempt_count, + old, new) != old); +} + +#define init_task_preempt_count(p) do { } while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + S390_lowcore.preempt_count = PREEMPT_ENABLED; \ +} while (0) + +static inline void set_preempt_need_resched(void) +{ + __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); +} + +static inline void clear_preempt_need_resched(void) +{ + __atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); +} + +static inline bool test_preempt_need_resched(void) +{ + return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED); +} + +static inline void __preempt_count_add(int val) +{ + if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) + __atomic_add_const(val, &S390_lowcore.preempt_count); + else + __atomic_add(val, &S390_lowcore.preempt_count); +} + +static inline void __preempt_count_sub(int val) +{ + __preempt_count_add(-val); +} + +static inline bool __preempt_count_dec_and_test(void) +{ + return __atomic_add(-1, &S390_lowcore.preempt_count) == 1; +} + +static inline bool should_resched(int preempt_offset) +{ + return unlikely(READ_ONCE(S390_lowcore.preempt_count) == + preempt_offset); +} + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define PREEMPT_ENABLED (0) + +static inline int preempt_count(void) +{ + return READ_ONCE(S390_lowcore.preempt_count); +} + +static inline void preempt_count_set(int pc) +{ + S390_lowcore.preempt_count = pc; +} + +#define init_task_preempt_count(p) do { } while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + S390_lowcore.preempt_count = PREEMPT_ENABLED; \ +} while (0) + +static inline void set_preempt_need_resched(void) +{ +} + +static inline void clear_preempt_need_resched(void) +{ +} + +static inline bool test_preempt_need_resched(void) +{ + return false; +} + +static inline void __preempt_count_add(int val) +{ + S390_lowcore.preempt_count += val; +} + +static inline void __preempt_count_sub(int val) +{ + S390_lowcore.preempt_count -= val; +} + +static inline bool __preempt_count_dec_and_test(void) +{ + return !--S390_lowcore.preempt_count && tif_need_resched(); +} + +static inline bool should_resched(int preempt_offset) +{ + return unlikely(preempt_count() == preempt_offset && + tif_need_resched()); +} + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#ifdef CONFIG_PREEMPT +extern asmlinkage void preempt_schedule(void); +#define __preempt_schedule() preempt_schedule() +extern asmlinkage void preempt_schedule_notrace(void); +#define __preempt_schedule_notrace() preempt_schedule_notrace() +#endif /* CONFIG_PREEMPT */ + +#endif /* __ASM_PREEMPT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 9d3a21aedc97..6bca916a5ba0 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -110,14 +110,20 @@ typedef struct { struct thread_struct { unsigned int acrs[NUM_ACRS]; unsigned long ksp; /* kernel stack pointer */ + unsigned long user_timer; /* task cputime in user space */ + unsigned long system_timer; /* task cputime in kernel space */ + unsigned long sys_call_table; /* system call table address */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ unsigned int gmap_write_flag; /* gmap fault write indication */ unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ + /* Per-thread information related to debugging */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ unsigned long per_flags; /* Flags to control debug behavior */ + unsigned int system_call; /* system call number in signal */ + unsigned long last_break; /* last breaking-event-address. */ /* pfault_wait is used to block the process on a pfault event */ unsigned long pfault_wait; struct list_head list; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 2ad9c204b1a2..8db92a5b3bf1 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -101,7 +101,8 @@ struct zpci_report_error_header { u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */ } __packed; -int sclp_get_core_info(struct sclp_core_info *info); +int _sclp_get_core_info_early(struct sclp_core_info *info); +int _sclp_get_core_info(struct sclp_core_info *info); int sclp_core_configure(u8 core); int sclp_core_deconfigure(u8 core); int sclp_sdias_blk_count(void); @@ -119,4 +120,11 @@ void sclp_early_detect(void); void _sclp_print_early(const char *); void sclp_ocf_cpc_name_copy(char *dst); +static inline int sclp_get_core_info(struct sclp_core_info *info, int early) +{ + if (early) + return _sclp_get_core_info_early(info); + return _sclp_get_core_info(info); +} + #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index 4af99cdaddf5..17a7904f001a 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -96,7 +96,8 @@ struct tm_scsw { u32 dstat:8; u32 cstat:8; u32 fcxs:8; - u32 schxs:8; + u32 ifob:1; + u32 sesq:7; } __attribute__ ((packed)); /** @@ -177,6 +178,9 @@ union scsw { #define SCHN_STAT_INTF_CTRL_CHK 0x02 #define SCHN_STAT_CHAIN_CHECK 0x01 +#define SCSW_SESQ_DEV_NOFCX 3 +#define SCSW_SESQ_PATH_NOFCX 4 + /* * architectured values for first sense byte */ diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 0cc383b9be7f..3deb134587b7 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -36,6 +36,7 @@ extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern int smp_cpu_get_polarization(int cpu); extern void smp_fill_possible_mask(void); +extern void smp_detect_cpus(void); #else /* CONFIG_SMP */ @@ -56,6 +57,7 @@ static inline int smp_store_status(int cpu) { return 0; } static inline int smp_vcpu_scheduled(int cpu) { return 1; } static inline void smp_yield_cpu(int cpu) { } static inline void smp_fill_possible_mask(void) { } +static inline void smp_detect_cpus(void) { } #endif /* CONFIG_SMP */ @@ -69,6 +71,12 @@ static inline void smp_stop_cpu(void) } } +/* Return thread 0 CPU number as base CPU */ +static inline int smp_get_base_cpu(int cpu) +{ + return cpu - (cpu % (smp_cpu_mtid + 1)); +} + #ifdef CONFIG_HOTPLUG_CPU extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 8662f5c8e17f..15a3c005c274 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -14,6 +14,7 @@ #define __HAVE_ARCH_MEMCHR /* inline & arch function */ #define __HAVE_ARCH_MEMCMP /* arch function */ #define __HAVE_ARCH_MEMCPY /* gcc builtin & arch function */ +#define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMSCAN /* inline & arch function */ #define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */ #define __HAVE_ARCH_STRCAT /* inline & arch function */ @@ -32,6 +33,7 @@ extern int memcmp(const void *, const void *, size_t); extern void *memcpy(void *, const void *, size_t); extern void *memset(void *, int, size_t); +extern void *memmove(void *, const void *, size_t); extern int strcmp(const char *,const char *); extern size_t strlcat(char *, const char *, size_t); extern size_t strlcpy(char *, const char *, size_t); @@ -40,7 +42,6 @@ extern char *strncpy(char *, const char *, size_t); extern char *strrchr(const char *, int); extern char *strstr(const char *, const char *); -#undef __HAVE_ARCH_MEMMOVE #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR #undef __HAVE_ARCH_STRNCMP diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 2728114d5484..229326c942c7 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -107,6 +107,11 @@ struct sysinfo_2_2_2 { char reserved_3[5]; unsigned short cpus_dedicated; unsigned short cpus_shared; + char reserved_4[3]; + unsigned char vsne; + uuid_be uuid; + char reserved_5[160]; + char ext_name[256]; }; #define LPAR_CHAR_DEDICATED (1 << 7) @@ -127,7 +132,7 @@ struct sysinfo_3_2_2 { unsigned int caf; char cpi[16]; char reserved_1[3]; - char ext_name_encoding; + unsigned char evmne; unsigned int reserved_2; uuid_be uuid; } vm[8]; diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index f15c0398c363..a5b54a445eb8 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -12,10 +12,10 @@ /* * Size of kernel stack for each process */ -#define THREAD_ORDER 2 +#define THREAD_SIZE_ORDER 2 #define ASYNC_ORDER 2 -#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define ASYNC_SIZE (PAGE_SIZE << ASYNC_ORDER) #ifndef __ASSEMBLY__ @@ -30,15 +30,7 @@ * - if the contents of this structure are changed, the assembly constants must also be changed */ struct thread_info { - struct task_struct *task; /* main task structure */ unsigned long flags; /* low level flags */ - unsigned long sys_call_table; /* System call table address */ - unsigned int cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, <0 => BUG */ - unsigned int system_call; - __u64 user_timer; - __u64 system_timer; - unsigned long last_break; /* last breaking-event-address. */ }; /* @@ -46,26 +38,14 @@ struct thread_info { */ #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ .flags = 0, \ - .cpu = 0, \ - .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *) S390_lowcore.thread_info; -} - void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -#define THREAD_SIZE_ORDER THREAD_ORDER - #endif /* diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 0bb08f341c09..de8298800722 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -52,11 +52,9 @@ static inline void store_clock_comparator(__u64 *time) void clock_comparator_work(void); -void __init ptff_init(void); +void __init time_early_init(void); extern unsigned char ptff_function_mask[16]; -extern unsigned long lpar_offset; -extern unsigned long initial_leap_seconds; /* Function codes for the ptff instruction. */ #define PTFF_QAF 0x00 /* query available functions */ @@ -100,21 +98,28 @@ struct ptff_qui { unsigned int pad_0x5c[41]; } __packed; -static inline int ptff(void *ptff_block, size_t len, unsigned int func) -{ - typedef struct { char _[len]; } addrtype; - register unsigned int reg0 asm("0") = func; - register unsigned long reg1 asm("1") = (unsigned long) ptff_block; - int rc; - - asm volatile( - " .word 0x0104\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc), "+m" (*(addrtype *) ptff_block) - : "d" (reg0), "d" (reg1) : "cc"); - return rc; -} +/* + * ptff - Perform timing facility function + * @ptff_block: Pointer to ptff parameter block + * @len: Length of parameter block + * @func: Function code + * Returns: Condition code (0 on success) + */ +#define ptff(ptff_block, len, func) \ +({ \ + struct addrtype { char _[len]; }; \ + register unsigned int reg0 asm("0") = func; \ + register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\ + int rc; \ + \ + asm volatile( \ + " .word 0x0104\n" \ + " ipm %0\n" \ + " srl %0,28\n" \ + : "=d" (rc), "+m" (*(struct addrtype *) reg1) \ + : "d" (reg0), "d" (reg1) : "cc"); \ + rc; \ +}) static inline unsigned long long local_tick_disable(void) { diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index f15f5571ca2b..fa1bfce10370 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -22,21 +22,22 @@ struct cpu_topology_s390 { cpumask_t drawer_mask; }; -DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology); - -#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id) -#define topology_thread_id(cpu) (per_cpu(cpu_topology, cpu).thread_id) -#define topology_sibling_cpumask(cpu) \ - (&per_cpu(cpu_topology, cpu).thread_mask) -#define topology_core_id(cpu) (per_cpu(cpu_topology, cpu).core_id) -#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask) -#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id) -#define topology_book_cpumask(cpu) (&per_cpu(cpu_topology, cpu).book_mask) -#define topology_drawer_id(cpu) (per_cpu(cpu_topology, cpu).drawer_id) -#define topology_drawer_cpumask(cpu) (&per_cpu(cpu_topology, cpu).drawer_mask) +extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; +extern cpumask_t cpus_with_topology; + +#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) +#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) +#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_mask) +#define topology_core_id(cpu) (cpu_topology[cpu].core_id) +#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_mask) +#define topology_book_id(cpu) (cpu_topology[cpu].book_id) +#define topology_book_cpumask(cpu) (&cpu_topology[cpu].book_mask) +#define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) +#define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) #define mc_capable() 1 +void topology_init_early(void); int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); @@ -46,6 +47,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #else /* CONFIG_SCHED_TOPOLOGY */ +static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline void topology_expect_change(void) { } @@ -65,7 +67,7 @@ static inline void topology_expect_change(void) { } #define cpu_to_node cpu_to_node static inline int cpu_to_node(int cpu) { - return per_cpu(cpu_topology, cpu).node_id; + return cpu_topology[cpu].node_id; } /* Returns a pointer to the cpumask of CPUs on node 'node'. */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 52d7c8709279..f82b04e85a21 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -37,14 +37,14 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.mm_segment) -#define set_fs(x) \ -({ \ +#define set_fs(x) \ +{ \ unsigned long __pto; \ current->thread.mm_segment = (x); \ __pto = current->thread.mm_segment.ar4 ? \ S390_lowcore.user_asce : S390_lowcore.kernel_asce; \ __ctl_load(__pto, 7, 7); \ -}) +} #define segment_eq(a,b) ((a).ar4 == (b).ar4) diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index d0a2dbf2433d..88bdc477a843 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -33,6 +33,8 @@ struct vdso_data { __u32 ectg_available; /* ECTG instruction present 0x58 */ __u32 tk_mult; /* Mult. used for xtime_nsec 0x5c */ __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */ + __u32 ts_dir; /* TOD steering direction 0x64 */ + __u64 ts_end; /* TOD steering end 0x68 */ }; struct vdso_per_cpu_data { diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index cc44b09c25fc..bf736e764cb4 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -12,6 +12,7 @@ header-y += dasd.h header-y += debug.h header-y += errno.h header-y += fcntl.h +header-y += hypfs.h header-y += ioctl.h header-y += ioctls.h header-y += ipcbuf.h @@ -29,16 +30,16 @@ header-y += ptrace.h header-y += qeth.h header-y += resource.h header-y += schid.h +header-y += sclp_ctl.h header-y += sembuf.h header-y += setup.h header-y += shmbuf.h +header-y += sie.h header-y += sigcontext.h header-y += siginfo.h header-y += signal.h header-y += socket.h header-y += sockios.h -header-y += sclp_ctl.h -header-y += sie.h header-y += stat.h header-y += statfs.h header-y += swab.h diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 1f0fe98f6db9..36b5101c8606 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -2,20 +2,47 @@ # Makefile for the linux kernel. # -KCOV_INSTRUMENT_early.o := n -KCOV_INSTRUMENT_sclp.o := n -KCOV_INSTRUMENT_als.o := n - ifdef CONFIG_FUNCTION_TRACER -# Don't trace early setup code and tracing code -CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) + +# Do not trace tracer code +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) + +# Do not trace early setup code +CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE) + +endif + +GCOV_PROFILE_als.o := n +GCOV_PROFILE_early.o := n +GCOV_PROFILE_sclp.o := n + +KCOV_INSTRUMENT_als.o := n +KCOV_INSTRUMENT_early.o := n +KCOV_INSTRUMENT_sclp.o := n + +UBSAN_SANITIZE_als.o := n +UBSAN_SANITIZE_early.o := n +UBSAN_SANITIZE_sclp.o := n + +# +# Use -march=z900 for sclp.c and als.c to be able to print an error +# message if the kernel is started on a machine which is too old +# +ifneq ($(CC_FLAGS_MARCH),-march=z900) +CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) +CFLAGS_als.o += -march=z900 +CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH) +CFLAGS_sclp.o += -march=z900 +AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) +AFLAGS_head.o += -march=z900 endif # # Passing null pointers is ok for smp code, since we access the lowcore here. # -CFLAGS_smp.o := -Wno-nonnull +CFLAGS_smp.o := -Wno-nonnull # # Disable tailcall optimizations for stack / callchain walking functions @@ -30,27 +57,7 @@ CFLAGS_dumpstack.o += -fno-optimize-sibling-calls # CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' -CFLAGS_sysinfo.o += -w - -# -# Use -march=z900 for sclp.c and als.c to be able to print an error -# message if the kernel is started on a machine which is too old -# -CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE) -ifneq ($(CC_FLAGS_MARCH),-march=z900) -CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH) -CFLAGS_sclp.o += -march=z900 -CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) -CFLAGS_als.o += -march=z900 -AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) -AFLAGS_head.o += -march=z900 -endif -GCOV_PROFILE_sclp.o := n -GCOV_PROFILE_als.o := n -UBSAN_SANITIZE_als.o := n -UBSAN_SANITIZE_early.o := n -UBSAN_SANITIZE_sclp.o := n +CFLAGS_sysinfo.o += -w obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f3df9e0a5dec..c4b3570ded5b 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -25,12 +25,14 @@ int main(void) { /* task struct offsets */ - OFFSET(__TASK_thread_info, task_struct, stack); + OFFSET(__TASK_stack, task_struct, stack); OFFSET(__TASK_thread, task_struct, thread); OFFSET(__TASK_pid, task_struct, pid); BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); + OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table); + OFFSET(__THREAD_last_break, thread_struct, last_break); OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc); OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs); OFFSET(__THREAD_per_cause, thread_struct, per_event.cause); @@ -39,14 +41,7 @@ int main(void) OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb); BLANK(); /* thread info offsets */ - OFFSET(__TI_task, thread_info, task); - OFFSET(__TI_flags, thread_info, flags); - OFFSET(__TI_sysc_table, thread_info, sys_call_table); - OFFSET(__TI_cpu, thread_info, cpu); - OFFSET(__TI_precount, thread_info, preempt_count); - OFFSET(__TI_user_timer, thread_info, user_timer); - OFFSET(__TI_system_timer, thread_info, system_timer); - OFFSET(__TI_last_break, thread_info, last_break); + OFFSET(__TI_flags, task_struct, thread_info.flags); BLANK(); /* pt_regs offsets */ OFFSET(__PT_ARGS, pt_regs, args); @@ -79,6 +74,8 @@ int main(void) OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available); OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult); OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift); + OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir); + OFFSET(__VDSO_TS_END, vdso_data, ts_end); OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base); OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time); OFFSET(__VDSO_CPU_NR, vdso_per_cpu_data, cpu_nr); @@ -159,7 +156,6 @@ int main(void) OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); OFFSET(__LC_CURRENT, lowcore, current_task); - OFFSET(__LC_THREAD_INFO, lowcore, thread_info); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); OFFSET(__LC_PANIC_STACK, lowcore, panic_stack); @@ -173,6 +169,7 @@ int main(void) OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data); OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); + OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); OFFSET(__LC_PASTE, lowcore, paste); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 4af60374eba0..6f2a6ab13cb5 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -446,7 +446,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, /* set extra registers only for synchronous signals */ regs->gprs[4] = regs->int_code & 127; regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = task_thread_info(current)->last_break; + regs->gprs[6] = current->thread.last_break; } return 0; @@ -523,7 +523,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, regs->gprs[2] = ksig->sig; regs->gprs[3] = (__force __u64) &frame->info; regs->gprs[4] = (__force __u64) &frame->uc; - regs->gprs[5] = task_thread_info(current)->last_break; + regs->gprs[5] = current->thread.last_break; return 0; } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 2374c5b46bbc..d038c8cea6cb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -293,6 +293,7 @@ static noinline __init void setup_lowcore_early(void) psw.addr = (unsigned long) s390_base_pgm_handler; S390_lowcore.program_new_psw = psw; s390_base_pgm_handler_fn = early_pgm_check_handler; + S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } static noinline __init void setup_facility_list(void) @@ -391,7 +392,49 @@ static int __init cad_init(void) } early_initcall(cad_init); -static __init void rescue_initrd(void) +static __init void memmove_early(void *dst, const void *src, size_t n) +{ + unsigned long addr; + long incr; + psw_t old; + + if (!n) + return; + incr = 1; + if (dst > src) { + incr = -incr; + dst += n - 1; + src += n - 1; + } + old = S390_lowcore.program_new_psw; + S390_lowcore.program_new_psw.mask = __extract_psw(); + asm volatile( + " larl %[addr],1f\n" + " stg %[addr],%[psw_pgm_addr]\n" + "0: mvc 0(1,%[dst]),0(%[src])\n" + " agr %[dst],%[incr]\n" + " agr %[src],%[incr]\n" + " brctg %[n],0b\n" + "1:\n" + : [addr] "=&d" (addr), + [psw_pgm_addr] "=&Q" (S390_lowcore.program_new_psw.addr), + [dst] "+&a" (dst), [src] "+&a" (src), [n] "+d" (n) + : [incr] "d" (incr) + : "cc", "memory"); + S390_lowcore.program_new_psw = old; +} + +static __init noinline void ipl_save_parameters(void) +{ + void *src, *dst; + + src = (void *)(unsigned long) S390_lowcore.ipl_parmblock_ptr; + dst = (void *) IPL_PARMBLOCK_ORIGIN; + memmove_early(dst, src, PAGE_SIZE); + S390_lowcore.ipl_parmblock_ptr = IPL_PARMBLOCK_ORIGIN; +} + +static __init noinline void rescue_initrd(void) { #ifdef CONFIG_BLK_DEV_INITRD unsigned long min_initrd_addr = (unsigned long) _end + (4UL << 20); @@ -405,7 +448,7 @@ static __init void rescue_initrd(void) return; if (INITRD_START >= min_initrd_addr) return; - memmove((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE); + memmove_early((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE); INITRD_START = min_initrd_addr; #endif } @@ -467,7 +510,8 @@ void __init startup_init(void) ipl_save_parameters(); rescue_initrd(); clear_bss_section(); - ptff_init(); + ipl_verify_parameters(); + time_early_init(); init_kernel_storage_key(); lockdep_off(); setup_lowcore_early(); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 49a30737adde..97298c58b2be 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -42,7 +42,7 @@ __PT_R13 = __PT_GPRS + 104 __PT_R14 = __PT_GPRS + 112 __PT_R15 = __PT_GPRS + 120 -STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER +STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER STACK_SIZE = 1 << STACK_SHIFT STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE @@ -123,8 +123,14 @@ _PIF_WORK = (_PIF_PER_TRAP) .macro LAST_BREAK scratch srag \scratch,%r10,23 +#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES jz .+10 - stg %r10,__TI_last_break(%r12) + stg %r10,__TASK_thread+__THREAD_last_break(%r12) +#else + jz .+14 + lghi \scratch,__TASK_thread + stg %r10,__THREAD_last_break(\scratch,%r12) +#endif .endm .macro REENABLE_IRQS @@ -186,14 +192,13 @@ ENTRY(__switch_to) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lgr %r1,%r2 aghi %r1,__TASK_thread # thread_struct of prev task - lg %r5,__TASK_thread_info(%r3) # get thread_info of next + lg %r5,__TASK_stack(%r3) # start of kernel stack of next stg %r15,__THREAD_ksp(%r1) # store kernel stack of prev lgr %r1,%r3 aghi %r1,__TASK_thread # thread_struct of next task lgr %r15,%r5 aghi %r15,STACK_INIT # end of kernel stack of next stg %r3,__LC_CURRENT # store task struct of next - stg %r5,__LC_THREAD_INFO # store thread info of next stg %r15,__LC_KERNEL_STACK # store end of kernel stack lg %r15,__THREAD_ksp(%r1) # load kernel stack of next /* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */ @@ -274,7 +279,7 @@ ENTRY(system_call) .Lsysc_stmg: stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT lghi %r14,_PIF_SYSCALL .Lsysc_per: lg %r15,__LC_KERNEL_STACK @@ -288,7 +293,13 @@ ENTRY(system_call) mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC stg %r14,__PT_FLAGS(%r11) .Lsysc_do_svc: - lg %r10,__TI_sysc_table(%r12) # address of system call table + # load address of system call table +#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES + lg %r10,__TASK_thread+__THREAD_sysc_table(%r12) +#else + lghi %r13,__TASK_thread + lg %r10,__THREAD_sysc_table(%r13,%r12) +#endif llgh %r8,__PT_INT_CODE+2(%r11) slag %r8,%r8,2 # shift and test for svc 0 jnz .Lsysc_nr_ok @@ -389,7 +400,6 @@ ENTRY(system_call) TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL jno .Lsysc_return lmg %r2,%r7,__PT_R2(%r11) # load svc arguments - lg %r10,__TI_sysc_table(%r12) # address of system call table lghi %r8,0 # svc 0 returns -ENOSYS llgh %r1,__PT_INT_CODE+2(%r11) # load new svc number cghi %r1,NR_syscalls @@ -457,7 +467,7 @@ ENTRY(system_call) # ENTRY(ret_from_fork) la %r11,STACK_FRAME_OVERHEAD(%r15) - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT brasl %r14,schedule_tail TRACE_IRQS_ON ssm __LC_SVC_NEW_PSW # reenable interrupts @@ -478,7 +488,7 @@ ENTRY(pgm_check_handler) stpt __LC_SYNC_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit @@ -501,7 +511,7 @@ ENTRY(pgm_check_handler) 2: LAST_BREAK %r14 UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER lg %r15,__LC_KERNEL_STACK - lg %r14,__TI_task(%r12) + lgr %r14,%r12 aghi %r14,__TASK_thread # pointer to thread_struct lghi %r13,__LC_PGM_TDB tm __LC_PGM_ILC+2,0x02 # check for transaction abort @@ -567,7 +577,7 @@ ENTRY(io_int_handler) stpt __LC_ASYNC_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_IO_OLD_PSW SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER @@ -626,7 +636,7 @@ ENTRY(io_int_handler) jo .Lio_work_user # yes -> do resched & signal #ifdef CONFIG_PREEMPT # check for preemptive scheduling - icm %r0,15,__TI_precount(%r12) + icm %r0,15,__LC_PREEMPT_COUNT jnz .Lio_restore # preemption is disabled TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED jno .Lio_restore @@ -741,7 +751,7 @@ ENTRY(ext_int_handler) stpt __LC_ASYNC_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_EXT_OLD_PSW SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER @@ -798,13 +808,10 @@ ENTRY(save_fpu_regs) TSTMSK __LC_CPU_FLAGS,_CIF_FPU bor %r14 stfpc __THREAD_FPU_fpc(%r2) -.Lsave_fpu_regs_fpc_end: lg %r3,__THREAD_FPU_regs(%r2) TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX jz .Lsave_fpu_regs_fp # no -> store FP regs -.Lsave_fpu_regs_vx_low: VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) -.Lsave_fpu_regs_vx_high: VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) j .Lsave_fpu_regs_done # -> set CIF_FPU flag .Lsave_fpu_regs_fp: @@ -851,9 +858,7 @@ load_fpu_regs: TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area jz .Lload_fpu_regs_fp # -> no VX, load FP regs -.Lload_fpu_regs_vx: VLM %v0,%v15,0,%r4 -.Lload_fpu_regs_vx_high: VLM %v16,%v31,256,%r4 j .Lload_fpu_regs_done .Lload_fpu_regs_fp: @@ -889,7 +894,7 @@ ENTRY(mcck_int_handler) spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_MCK_OLD_PSW TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE @@ -948,7 +953,7 @@ ENTRY(mcck_int_handler) .Lmcck_panic: lg %r15,__LC_PANIC_STACK - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) j .Lmcck_skip # @@ -1085,7 +1090,7 @@ cleanup_critical: jhe 0f # set up saved registers r10 and r12 stg %r10,16(%r11) # r10 last break - stg %r12,32(%r11) # r12 thread-info pointer + stg %r12,32(%r11) # r12 task struct pointer 0: # check if the user time update has been done clg %r9,BASED(.Lcleanup_system_call_insn+24) jh 0f @@ -1106,7 +1111,9 @@ cleanup_critical: lg %r9,16(%r11) srag %r9,%r9,23 jz 0f - mvc __TI_last_break(8,%r12),16(%r11) + lgr %r9,%r12 + aghi %r9,__TASK_thread + mvc __THREAD_last_break(8,%r9),16(%r11) 0: # set up saved register r11 lg %r15,__LC_KERNEL_STACK la %r9,STACK_FRAME_OVERHEAD(%r15) diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index 4431905f8cfa..0b5ebf8a3d30 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -315,7 +315,7 @@ ENTRY(startup_kdump) jg startup_continue .Lstack: - .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER)) + .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_SIZE_ORDER)) .align 8 6: .long 0x7fffffff,0xffffffff diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 03c2b469c472..482d3526e32b 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -32,11 +32,10 @@ ENTRY(startup_continue) # # Setup stack # - larl %r15,init_thread_union - stg %r15,__LC_THREAD_INFO # cache thread info in lowcore - lg %r14,__TI_task(%r15) # cache current in lowcore + larl %r14,init_task stg %r14,__LC_CURRENT - aghi %r15,1<<(PAGE_SHIFT+THREAD_ORDER) # init_task_union + THREAD_SIZE + larl %r15,init_thread_union + aghi %r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) # init_task_union + THREAD_SIZE stg %r15,__LC_KERNEL_STACK # set end of kernel stack aghi %r15,-160 # diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 295bfb7124bc..ff3364a067ff 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1991,10 +1991,9 @@ void __init ipl_update_parameters(void) diag308_set_works = 1; } -void __init ipl_save_parameters(void) +void __init ipl_verify_parameters(void) { struct cio_iplinfo iplinfo; - void *src, *dst; if (cio_get_iplinfo(&iplinfo)) return; @@ -2005,10 +2004,6 @@ void __init ipl_save_parameters(void) if (!iplinfo.is_qdio) return; ipl_flags |= IPL_PARMBLOCK_VALID; - src = (void *)(unsigned long)S390_lowcore.ipl_parmblock_ptr; - dst = (void *)IPL_PARMBLOCK_ORIGIN; - memmove(dst, src, PAGE_SIZE); - S390_lowcore.ipl_parmblock_ptr = IPL_PARMBLOCK_ORIGIN; } static LIST_HEAD(rcall); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 285d6561076d..ef60f4177331 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -168,7 +168,7 @@ void do_softirq_own_stack(void) old = current_stack_pointer(); /* Check against async. stack address range. */ new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) { + if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { /* Need to switch to the async. stack. */ new -= STACK_FRAME_OVERHEAD; ((struct stack_frame *) new)->back_chain = old; diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 6ea6d69339b5..ae7dff110054 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -5,7 +5,8 @@ * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/timer.h> #include <linux/slab.h> #include <asm/facility.h> @@ -183,4 +184,4 @@ static int __init lgr_init(void) lgr_timer_set(); return 0; } -module_init(lgr_init); +device_initcall(lgr_init); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index fcc634c1479a..763dec18edcd 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -995,39 +995,36 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) regs.int_parm = CPU_MF_INT_SF_PRA; sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long; - regs.psw.addr = sfr->basic.ia; - if (sfr->basic.T) - regs.psw.mask |= PSW_MASK_DAT; - if (sfr->basic.W) - regs.psw.mask |= PSW_MASK_WAIT; - if (sfr->basic.P) - regs.psw.mask |= PSW_MASK_PSTATE; - switch (sfr->basic.AS) { - case 0x0: - regs.psw.mask |= PSW_ASC_PRIMARY; - break; - case 0x1: - regs.psw.mask |= PSW_ASC_ACCREG; - break; - case 0x2: - regs.psw.mask |= PSW_ASC_SECONDARY; - break; - case 0x3: - regs.psw.mask |= PSW_ASC_HOME; - break; - } + psw_bits(regs.psw).ia = sfr->basic.ia; + psw_bits(regs.psw).t = sfr->basic.T; + psw_bits(regs.psw).w = sfr->basic.W; + psw_bits(regs.psw).p = sfr->basic.P; + psw_bits(regs.psw).as = sfr->basic.AS; /* - * A non-zero guest program parameter indicates a guest - * sample. - * Note that some early samples or samples from guests without + * Use the hardware provided configuration level to decide if the + * sample belongs to a guest or host. If that is not available, + * fall back to the following heuristics: + * A non-zero guest program parameter always indicates a guest + * sample. Some early samples or samples from guests without * lpp usage would be misaccounted to the host. We use the asn - * value as a heuristic to detect most of these guest samples. - * If the value differs from the host hpp value, we assume - * it to be a KVM guest. + * value as an addon heuristic to detect most of these guest samples. + * If the value differs from the host hpp value, we assume to be a + * KVM guest. */ - if (sfr->basic.gpp || sfr->basic.prim_asn != (u16) sfr->basic.hpp) + switch (sfr->basic.CL) { + case 1: /* logical partition */ + sde_regs->in_guest = 0; + break; + case 2: /* virtual machine */ sde_regs->in_guest = 1; + break; + default: /* old machine, use heuristics */ + if (sfr->basic.gpp || + sfr->basic.prim_asn != (u16)sfr->basic.hpp) + sde_regs->in_guest = 1; + break; + } overflow = 0; if (perf_exclude_event(event, ®s, sde_regs)) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index bba4fa74b321..400d14f0b9f5 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -103,7 +103,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long arg, struct task_struct *p) { - struct thread_info *ti; struct fake_frame { struct stack_frame sf; @@ -121,9 +120,8 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); clear_tsk_thread_flag(p, TIF_SINGLE_STEP); /* Initialize per thread user and system timer values */ - ti = task_thread_info(p); - ti->user_timer = 0; - ti->system_timer = 0; + p->thread.user_timer = 0; + p->thread.system_timer = 0; frame->sf.back_chain = 0; /* new return point is ret_from_fork */ diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 9336e824e2db..b81ab8882e2e 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -461,7 +461,7 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(task_thread_info(child)->last_break, + put_user(child->thread.last_break, (unsigned long __user *) data); return 0; case PTRACE_ENABLE_TE: @@ -811,7 +811,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(task_thread_info(child)->last_break, + put_user(child->thread.last_break, (unsigned int __user *) data); return 0; } @@ -997,10 +997,10 @@ static int s390_last_break_get(struct task_struct *target, if (count > 0) { if (kbuf) { unsigned long *k = kbuf; - *k = task_thread_info(target)->last_break; + *k = target->thread.last_break; } else { unsigned long __user *u = ubuf; - if (__put_user(task_thread_info(target)->last_break, u)) + if (__put_user(target->thread.last_break, u)) return -EFAULT; } } @@ -1113,7 +1113,7 @@ static int s390_system_call_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - unsigned int *data = &task_thread_info(target)->system_call; + unsigned int *data = &target->thread.system_call; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, sizeof(unsigned int)); } @@ -1123,7 +1123,7 @@ static int s390_system_call_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - unsigned int *data = &task_thread_info(target)->system_call; + unsigned int *data = &target->thread.system_call; return user_regset_copyin(&pos, &count, &kbuf, &ubuf, data, 0, sizeof(unsigned int)); } @@ -1327,7 +1327,7 @@ static int s390_compat_last_break_get(struct task_struct *target, compat_ulong_t last_break; if (count > 0) { - last_break = task_thread_info(target)->last_break; + last_break = target->thread.last_break; if (kbuf) { unsigned long *k = kbuf; *k = last_break; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 7f7ba5f23f13..adfac9f0a89f 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -35,6 +35,7 @@ #include <linux/root_dev.h> #include <linux/console.h> #include <linux/kernel_stat.h> +#include <linux/dma-contiguous.h> #include <linux/device.h> #include <linux/notifier.h> #include <linux/pfn.h> @@ -303,7 +304,7 @@ static void __init setup_lowcore(void) * Setup lowcore for boot cpu */ BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096); - lc = __alloc_bootmem_low(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0); + lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; lc->external_new_psw.mask = PSW_KERNEL_BITS | @@ -324,15 +325,15 @@ static void __init setup_lowcore(void) lc->kernel_stack = ((unsigned long) &init_thread_union) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->async_stack = (unsigned long) - __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0) + memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE) + ASYNC_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->panic_stack = (unsigned long) - __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE) + PAGE_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->current_task = (unsigned long) init_thread_union.thread_info.task; - lc->thread_info = (unsigned long) &init_thread_union; + lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; lc->machine_flags = S390_lowcore.machine_flags; + lc->preempt_count = S390_lowcore.preempt_count; lc->stfl_fac_list = S390_lowcore.stfl_fac_list; memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, MAX_FACILITY_BIT/8); @@ -349,7 +350,7 @@ static void __init setup_lowcore(void) lc->last_update_timer = S390_lowcore.last_update_timer; lc->last_update_clock = S390_lowcore.last_update_clock; - restart_stack = __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0); + restart_stack = memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE); restart_stack += ASYNC_SIZE; /* @@ -412,7 +413,7 @@ static void __init setup_resources(void) bss_resource.end = (unsigned long) &__bss_stop - 1; for_each_memblock(memory, reg) { - res = alloc_bootmem_low(sizeof(*res)); + res = memblock_virt_alloc(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -426,7 +427,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = alloc_bootmem_low(sizeof(*sub_res)); + sub_res = memblock_virt_alloc(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -445,7 +446,7 @@ static void __init setup_resources(void) * part of the System RAM resource. */ if (crashk_res.end) { - memblock_add(crashk_res.start, resource_size(&crashk_res)); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } @@ -903,6 +904,7 @@ void __init setup_arch(char **cmdline_p) setup_memory_end(); setup_memory(); + dma_contiguous_reserve(memory_end); check_initrd(); reserve_crashkernel(); @@ -921,6 +923,8 @@ void __init setup_arch(char **cmdline_p) cpu_detect_mhz_feature(); cpu_init(); numa_setup(); + smp_detect_cpus(); + topology_init_early(); /* * Create kernel page tables and switch to virtual addressing. diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index d82562cf0a0e..9f241d1efeda 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -359,7 +359,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* set extra registers only for synchronous signals */ regs->gprs[4] = regs->int_code & 127; regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = task_thread_info(current)->last_break; + regs->gprs[6] = current->thread.last_break; } return 0; } @@ -430,7 +430,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, regs->gprs[2] = ksig->sig; regs->gprs[3] = (unsigned long) &frame->info; regs->gprs[4] = (unsigned long) &frame->uc; - regs->gprs[5] = task_thread_info(current)->last_break; + regs->gprs[5] = current->thread.last_break; return 0; } @@ -467,13 +467,13 @@ void do_signal(struct pt_regs *regs) * the debugger may change all our registers, including the system * call information. */ - current_thread_info()->system_call = + current->thread.system_call = test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ - if (current_thread_info()->system_call) { - regs->int_code = current_thread_info()->system_call; + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; /* Check for system call restarting. */ switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: @@ -506,8 +506,8 @@ void do_signal(struct pt_regs *regs) /* No handlers present - check for system call restart */ clear_pt_regs_flag(regs, PIF_SYSCALL); - if (current_thread_info()->system_call) { - regs->int_code = current_thread_info()->system_call; + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index df4a508ff35c..e49f61aadaf9 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -19,6 +19,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/workqueue.h> +#include <linux/bootmem.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mm.h> @@ -259,16 +260,14 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) { struct lowcore *lc = pcpu->lowcore; - struct thread_info *ti = task_thread_info(tsk); lc->kernel_stack = (unsigned long) task_stack_page(tsk) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->thread_info = (unsigned long) task_thread_info(tsk); lc->current_task = (unsigned long) tsk; lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; - lc->user_timer = ti->user_timer; - lc->system_timer = ti->system_timer; + lc->user_timer = tsk->thread.user_timer; + lc->system_timer = tsk->thread.system_timer; lc->steal_timer = 0; } @@ -662,14 +661,12 @@ int smp_cpu_get_polarization(int cpu) return pcpu_devices[cpu].polarization; } -static struct sclp_core_info *smp_get_core_info(void) +static void __ref smp_get_core_info(struct sclp_core_info *info, int early) { static int use_sigp_detection; - struct sclp_core_info *info; int address; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info && (use_sigp_detection || sclp_get_core_info(info))) { + if (use_sigp_detection || sclp_get_core_info(info, early)) { use_sigp_detection = 1; for (address = 0; address < (SCLP_MAX_CORES << smp_cpu_mt_shift); @@ -683,7 +680,6 @@ static struct sclp_core_info *smp_get_core_info(void) } info->combined = info->configured; } - return info; } static int smp_add_present_cpu(int cpu); @@ -724,17 +720,15 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, int sysfs_add) return nr; } -static void __init smp_detect_cpus(void) +void __init smp_detect_cpus(void) { unsigned int cpu, mtid, c_cpus, s_cpus; struct sclp_core_info *info; u16 address; /* Get CPU information */ - info = smp_get_core_info(); - if (!info) - panic("smp_detect_cpus failed to allocate memory\n"); - + info = memblock_virt_alloc(sizeof(*info), 8); + smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { address = stap(); @@ -770,7 +764,7 @@ static void __init smp_detect_cpus(void) get_online_cpus(); __smp_rescan_cpus(info, 0); put_online_cpus(); - kfree(info); + memblock_free_early((unsigned long)info, sizeof(*info)); } /* @@ -807,7 +801,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu = pcpu_devices + cpu; if (pcpu->state != CPU_STATE_CONFIGURED) return -EIO; - base = cpu - (cpu % (smp_cpu_mtid + 1)); + base = smp_get_base_cpu(cpu); for (i = 0; i <= smp_cpu_mtid; i++) { if (base + i < nr_cpu_ids) if (cpu_online(base + i)) @@ -907,7 +901,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* request the 0x1202 external call external interrupt */ if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); - smp_detect_cpus(); } void __init smp_prepare_boot_cpu(void) @@ -973,7 +966,7 @@ static ssize_t cpu_configure_store(struct device *dev, rc = -EBUSY; /* disallow configuration changes of online cpus and cpu 0 */ cpu = dev->id; - cpu -= cpu % (smp_cpu_mtid + 1); + cpu = smp_get_base_cpu(cpu); if (cpu == 0) goto out; for (i = 0; i <= smp_cpu_mtid; i++) @@ -1106,9 +1099,10 @@ int __ref smp_rescan_cpus(void) struct sclp_core_info *info; int nr; - info = smp_get_core_info(); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; + smp_get_core_info(info, 0); get_online_cpus(); mutex_lock(&smp_cpu_state_mutex); nr = __smp_rescan_cpus(info, 1); diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 2d6b6e81f812..1ff21f05d7dd 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -194,7 +194,7 @@ pgm_check_entry: /* Suspend CPU not available -> panic */ larl %r15,init_thread_union - ahi %r15,1<<(PAGE_SHIFT+THREAD_ORDER) + ahi %r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) larl %r2,.Lpanic_string larl %r3,_sclp_print_early lghi %r1,0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index bfda6aa40280..24021c1e3ecb 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -56,6 +56,20 @@ int stsi(void *sysinfo, int fc, int sel1, int sel2) } EXPORT_SYMBOL(stsi); +static bool convert_ext_name(unsigned char encoding, char *name, size_t len) +{ + switch (encoding) { + case 1: /* EBCDIC */ + EBCASC(name, len); + break; + case 2: /* UTF-8 */ + break; + default: + return false; + } + return true; +} + static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { int i; @@ -207,24 +221,19 @@ static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info) seq_printf(m, "LPAR CPUs S-MTID: %d\n", info->mt_stid); seq_printf(m, "LPAR CPUs PS-MTID: %d\n", info->mt_psmtid); } + if (convert_ext_name(info->vsne, info->ext_name, sizeof(info->ext_name))) { + seq_printf(m, "LPAR Extended Name: %-.256s\n", info->ext_name); + seq_printf(m, "LPAR UUID: %pUb\n", &info->uuid); + } } static void print_ext_name(struct seq_file *m, int lvl, struct sysinfo_3_2_2 *info) { - if (info->vm[lvl].ext_name_encoding == 0) - return; - if (info->ext_names[lvl][0] == 0) - return; - switch (info->vm[lvl].ext_name_encoding) { - case 1: /* EBCDIC */ - EBCASC(info->ext_names[lvl], sizeof(info->ext_names[lvl])); - break; - case 2: /* UTF-8 */ - break; - default: + size_t len = sizeof(info->ext_names[lvl]); + + if (!convert_ext_name(info->vm[lvl].evmne, info->ext_names[lvl], len)) return; - } seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl, info->ext_names[lvl]); } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 0bfcc492987e..867d0a057046 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -59,19 +59,27 @@ ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); EXPORT_SYMBOL(s390_epoch_delta_notifier); unsigned char ptff_function_mask[16]; -unsigned long lpar_offset; -unsigned long initial_leap_seconds; + +static unsigned long long lpar_offset; +static unsigned long long initial_leap_seconds; +static unsigned long long tod_steering_end; +static long long tod_steering_delta; /* * Get time offsets with PTFF */ -void __init ptff_init(void) +void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + /* Initialize TOD steering parameters */ + tod_steering_end = sched_clock_base_cc; + vdso_data->ts_end = tod_steering_end; + if (!test_facility(28)) return; + ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF); /* get LPAR offset */ @@ -80,7 +88,7 @@ void __init ptff_init(void) /* get initial leap seconds */ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) - initial_leap_seconds = (unsigned long) + initial_leap_seconds = (unsigned long long) ((long) qui.old_leap * 4096000000L); } @@ -123,18 +131,6 @@ void clock_comparator_work(void) cd->event_handler(cd); } -/* - * Fixup the clock comparator. - */ -static void fixup_clock_comparator(unsigned long long delta) -{ - /* If nobody is waiting there's nothing to fix. */ - if (S390_lowcore.clock_comparator == -1ULL) - return; - S390_lowcore.clock_comparator += delta; - set_clock_comparator(S390_lowcore.clock_comparator); -} - static int s390_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -215,7 +211,21 @@ void read_boot_clock64(struct timespec64 *ts) static cycle_t read_tod_clock(struct clocksource *cs) { - return get_tod_clock(); + unsigned long long now, adj; + + preempt_disable(); /* protect from changes to steering parameters */ + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* + * manually steer by 1 cycle every 2^16 cycles. This + * corresponds to shifting the tod delta by 15. 1s is + * therefore steered in ~9h. The adjust will decrease + * over time, until it finally reaches 0. + */ + now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + preempt_enable(); + return now; } static struct clocksource clocksource_tod = { @@ -384,6 +394,55 @@ static inline int check_sync_clock(void) return rc; } +/* + * Apply clock delta to the global data structures. + * This is called once on the CPU that performed the clock sync. + */ +static void clock_sync_global(unsigned long long delta) +{ + unsigned long now, adj; + struct ptff_qto qto; + + /* Fixup the monotonic sched clock. */ + sched_clock_base_cc += delta; + /* Adjust TOD steering parameters. */ + vdso_data->tb_update_count++; + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* Calculate how much of the old adjustment is left. */ + tod_steering_delta = (tod_steering_delta < 0) ? + -(adj >> 15) : (adj >> 15); + tod_steering_delta += delta; + if ((abs(tod_steering_delta) >> 48) != 0) + panic("TOD clock sync offset %lli is too large to drift\n", + tod_steering_delta); + tod_steering_end = now + (abs(tod_steering_delta) << 15); + vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1; + vdso_data->ts_end = tod_steering_end; + vdso_data->tb_update_count++; + /* Update LPAR offset. */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + /* Call the TOD clock change notifier. */ + atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, &delta); +} + +/* + * Apply clock delta to the per-CPU data structures of this CPU. + * This is called for each online CPU after the call to clock_sync_global. + */ +static void clock_sync_local(unsigned long long delta) +{ + /* Add the delta to the clock comparator. */ + if (S390_lowcore.clock_comparator != -1ULL) { + S390_lowcore.clock_comparator += delta; + set_clock_comparator(S390_lowcore.clock_comparator); + } + /* Adjust the last_update_clock time-stamp. */ + S390_lowcore.last_update_clock += delta; +} + /* Single threaded workqueue used for stp sync events */ static struct workqueue_struct *time_sync_wq; @@ -397,31 +456,9 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long long fixup_cc; + unsigned long long clock_delta; }; -static void clock_sync_cpu(struct clock_sync_data *sync) -{ - atomic_dec(&sync->cpus); - enable_sync_clock(); - while (sync->in_sync == 0) { - __udelay(1); - /* - * A different cpu changes *in_sync. Therefore use - * barrier() to force memory access. - */ - barrier(); - } - if (sync->in_sync != 1) - /* Didn't work. Clear per-cpu in sync bit again. */ - disable_sync_clock(NULL); - /* - * This round of TOD syncing is done. Set the clock comparator - * to the next tick and let the processor continue. - */ - fixup_clock_comparator(sync->fixup_cc); -} - /* * Server Time Protocol (STP) code. */ @@ -523,54 +560,46 @@ void stp_queue_work(void) static int stp_sync_clock(void *data) { - static int first; + struct clock_sync_data *sync = data; unsigned long long clock_delta; - struct clock_sync_data *stp_sync; - struct ptff_qto qto; + static int first; int rc; - stp_sync = data; - - if (xchg(&first, 1) == 1) { - /* Slave */ - clock_sync_cpu(stp_sync); - return 0; - } - - /* Wait until all other cpus entered the sync function. */ - while (atomic_read(&stp_sync->cpus) != 0) - cpu_relax(); - enable_sync_clock(); - - rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { - rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); - if (rc == 0) { - /* fixup the monotonic sched clock */ - sched_clock_base_cc += clock_delta; - if (ptff_query(PTFF_QTO) && - ptff(&qto, sizeof(qto), PTFF_QTO) == 0) - /* Update LPAR offset */ - lpar_offset = qto.tod_epoch_difference; - atomic_notifier_call_chain(&s390_epoch_delta_notifier, - 0, &clock_delta); - stp_sync->fixup_cc = clock_delta; - fixup_clock_comparator(clock_delta); - rc = chsc_sstpi(stp_page, &stp_info, - sizeof(struct stp_sstpi)); - if (rc == 0 && stp_info.tmd != 2) - rc = -EAGAIN; + if (xchg(&first, 1) == 0) { + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&sync->cpus) != 0) + cpu_relax(); + rc = 0; + if (stp_info.todoff[0] || stp_info.todoff[1] || + stp_info.todoff[2] || stp_info.todoff[3] || + stp_info.tmd != 2) { + rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, + &clock_delta); + if (rc == 0) { + sync->clock_delta = clock_delta; + clock_sync_global(clock_delta); + rc = chsc_sstpi(stp_page, &stp_info, + sizeof(struct stp_sstpi)); + if (rc == 0 && stp_info.tmd != 2) + rc = -EAGAIN; + } } + sync->in_sync = rc ? -EAGAIN : 1; + xchg(&first, 0); + } else { + /* Slave */ + atomic_dec(&sync->cpus); + /* Wait for in_sync to be set. */ + while (READ_ONCE(sync->in_sync) == 0) + __udelay(1); } - if (rc) { + if (sync->in_sync != 1) + /* Didn't work. Clear per-cpu in sync bit again. */ disable_sync_clock(NULL); - stp_sync->in_sync = -EAGAIN; - } else - stp_sync->in_sync = 1; - xchg(&first, 0); + /* Apply clock delta to per-CPU fields of this CPU. */ + clock_sync_local(sync->clock_delta); + return 0; } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index e959c02e0cac..93dcbae1e98d 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/workqueue.h> +#include <linux/bootmem.h> #include <linux/cpuset.h> #include <linux/device.h> #include <linux/export.h> @@ -41,15 +42,17 @@ static bool topology_enabled = true; static DECLARE_WORK(topology_work, topology_work_fn); /* - * Socket/Book linked lists and per_cpu(cpu_topology) updates are + * Socket/Book linked lists and cpu_topology updates are * protected by "sched_domains_mutex". */ static struct mask_info socket_info; static struct mask_info book_info; static struct mask_info drawer_info; -DEFINE_PER_CPU(struct cpu_topology_s390, cpu_topology); -EXPORT_PER_CPU_SYMBOL_GPL(cpu_topology); +struct cpu_topology_s390 cpu_topology[NR_CPUS]; +EXPORT_SYMBOL_GPL(cpu_topology); + +cpumask_t cpus_with_topology; static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) { @@ -97,7 +100,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, if (lcpu < 0) continue; for (i = 0; i <= smp_cpu_mtid; i++) { - topo = &per_cpu(cpu_topology, lcpu + i); + topo = &cpu_topology[lcpu + i]; topo->drawer_id = drawer->id; topo->book_id = book->id; topo->socket_id = socket->id; @@ -106,6 +109,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, cpumask_set_cpu(lcpu + i, &drawer->mask); cpumask_set_cpu(lcpu + i, &book->mask); cpumask_set_cpu(lcpu + i, &socket->mask); + cpumask_set_cpu(lcpu + i, &cpus_with_topology); smp_cpu_set_polarization(lcpu + i, tl_core->pp); } } @@ -220,7 +224,7 @@ static void update_cpu_masks(void) int cpu; for_each_possible_cpu(cpu) { - topo = &per_cpu(cpu_topology, cpu); + topo = &cpu_topology[cpu]; topo->thread_mask = cpu_thread_map(cpu); topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); @@ -231,6 +235,8 @@ static void update_cpu_masks(void) topo->socket_id = cpu; topo->book_id = cpu; topo->drawer_id = cpu; + if (cpu_present(cpu)) + cpumask_set_cpu(cpu, &cpus_with_topology); } } numa_update_cpu_topology(); @@ -241,12 +247,12 @@ void store_topology(struct sysinfo_15_1_x *info) stsi(info, 15, 1, min(topology_max_mnest, 4)); } -int arch_update_cpu_topology(void) +static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; - struct device *dev; - int cpu, rc = 0; + int rc = 0; + cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; store_topology(info); @@ -255,6 +261,15 @@ int arch_update_cpu_topology(void) update_cpu_masks(); if (!MACHINE_HAS_TOPOLOGY) topology_update_polarization_simple(); + return rc; +} + +int arch_update_cpu_topology(void) +{ + struct device *dev; + int cpu, rc; + + rc = __arch_update_cpu_topology(); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); kobject_uevent(&dev->kobj, KOBJ_CHANGE); @@ -394,23 +409,23 @@ int topology_cpu_init(struct cpu *cpu) static const struct cpumask *cpu_thread_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).thread_mask; + return &cpu_topology[cpu].thread_mask; } const struct cpumask *cpu_coregroup_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).core_mask; + return &cpu_topology[cpu].core_mask; } static const struct cpumask *cpu_book_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).book_mask; + return &cpu_topology[cpu].book_mask; } static const struct cpumask *cpu_drawer_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).drawer_mask; + return &cpu_topology[cpu].drawer_mask; } static int __init early_parse_topology(char *p) @@ -438,19 +453,20 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = kzalloc(sizeof(*mask->next), GFP_KERNEL); + mask->next = memblock_virt_alloc(sizeof(*mask->next), 8); mask = mask->next; } } -static int __init s390_topology_init(void) +void __init topology_init_early(void) { struct sysinfo_15_1_x *info; int i; + set_sched_topology(s390_topology); if (!MACHINE_HAS_TOPOLOGY) - return 0; - tl_info = (struct sysinfo_15_1_x *)__get_free_page(GFP_KERNEL); + goto out; + tl_info = memblock_virt_alloc(sizeof(*tl_info), PAGE_SIZE); info = tl_info; store_topology(info); pr_info("The CPU configuration topology of the machine is:"); @@ -460,10 +476,9 @@ static int __init s390_topology_init(void) alloc_masks(info, &socket_info, 1); alloc_masks(info, &book_info, 2); alloc_masks(info, &drawer_info, 3); - set_sched_topology(s390_topology); - return 0; +out: + __arch_update_cpu_topology(); } -early_initcall(s390_topology_init); static int __init topology_init(void) { diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S index 5eec9afbb5b5..a5769b83d90e 100644 --- a/arch/s390/kernel/vdso32/clock_gettime.S +++ b/arch/s390/kernel/vdso32/clock_gettime.S @@ -99,8 +99,27 @@ __kernel_clock_gettime: tml %r4,0x0001 /* pending update ? loop */ jnz 11b stcke 0(%r15) /* Store TOD clock */ - lm %r0,%r1,1(%r15) - s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + lm %r0,%r1,__VDSO_TS_END(%r5) /* TOD steering end time */ + s %r0,1(%r15) /* no - ts_steering_end */ + sl %r1,5(%r15) + brc 3,22f + ahi %r0,-1 +22: ltr %r0,%r0 /* past end of steering? */ + jm 24f + srdl %r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 23f + lcr %r0,%r0 /* negative TOD offset */ + lcr %r1,%r1 + je 23f + ahi %r0,-1 +23: a %r0,1(%r15) /* add TOD timestamp */ + al %r1,5(%r15) + brc 12,25f + ahi %r0,1 + j 25f +24: lm %r0,%r1,1(%r15) /* load TOD timestamp */ +25: s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,12f ahi %r0,-1 diff --git a/arch/s390/kernel/vdso32/gettimeofday.S b/arch/s390/kernel/vdso32/gettimeofday.S index 719de6186b20..63b86dceb0bf 100644 --- a/arch/s390/kernel/vdso32/gettimeofday.S +++ b/arch/s390/kernel/vdso32/gettimeofday.S @@ -31,8 +31,27 @@ __kernel_gettimeofday: tml %r4,0x0001 /* pending update ? loop */ jnz 1b stcke 0(%r15) /* Store TOD clock */ - lm %r0,%r1,1(%r15) - s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + lm %r0,%r1,__VDSO_TS_END(%r5) /* TOD steering end time */ + s %r0,1(%r15) + sl %r1,5(%r15) + brc 3,14f + ahi %r0,-1 +14: ltr %r0,%r0 /* past end of steering? */ + jm 16f + srdl %r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 15f + lcr %r0,%r0 /* negative TOD offset */ + lcr %r1,%r1 + je 15f + ahi %r0,-1 +15: a %r0,1(%r15) /* add TOD timestamp */ + al %r1,5(%r15) + brc 12,17f + ahi %r0,1 + j 17f +16: lm %r0,%r1,1(%r15) /* load TOD timestamp */ +17: s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,3f ahi %r0,-1 diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 61541fb93dc6..9c3b12626dba 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -83,8 +83,17 @@ __kernel_clock_gettime: tmll %r4,0x0001 /* pending update ? loop */ jnz 5b stcke 0(%r15) /* Store TOD clock */ - lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ lg %r1,1(%r15) + lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ + slgr %r0,%r1 /* now - ts_steering_end */ + ltgr %r0,%r0 /* past end of steering ? */ + jm 17f + srlg %r0,%r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 18f + lcgr %r0,%r0 /* negative TOD offset */ +18: algr %r1,%r0 /* add steering offset */ +17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S index 6ce46707663c..b02e62f3bc12 100644 --- a/arch/s390/kernel/vdso64/gettimeofday.S +++ b/arch/s390/kernel/vdso64/gettimeofday.S @@ -31,7 +31,16 @@ __kernel_gettimeofday: jnz 0b stcke 0(%r15) /* Store TOD clock */ lg %r1,1(%r15) - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ + slgr %r0,%r1 /* now - ts_steering_end */ + ltgr %r0,%r0 /* past end of steering ? */ + jm 6f + srlg %r0,%r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 7f + lcgr %r0,%r0 /* negative TOD offset */ +7: algr %r1,%r0 /* add steering offset */ +6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1bd5dde2d5a9..6b246aadf311 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -96,7 +96,6 @@ static void update_mt_scaling(void) */ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) { - struct thread_info *ti = task_thread_info(tsk); u64 timer, clock, user, system, steal; u64 user_scaled, system_scaled; @@ -119,13 +118,13 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); - user = S390_lowcore.user_timer - ti->user_timer; + user = S390_lowcore.user_timer - tsk->thread.user_timer; S390_lowcore.steal_timer -= user; - ti->user_timer = S390_lowcore.user_timer; + tsk->thread.user_timer = S390_lowcore.user_timer; - system = S390_lowcore.system_timer - ti->system_timer; + system = S390_lowcore.system_timer - tsk->thread.system_timer; S390_lowcore.steal_timer -= system; - ti->system_timer = S390_lowcore.system_timer; + tsk->thread.system_timer = S390_lowcore.system_timer; user_scaled = user; system_scaled = system; @@ -153,15 +152,11 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) void vtime_task_switch(struct task_struct *prev) { - struct thread_info *ti; - do_account_vtime(prev, 0); - ti = task_thread_info(prev); - ti->user_timer = S390_lowcore.user_timer; - ti->system_timer = S390_lowcore.system_timer; - ti = task_thread_info(current); - S390_lowcore.user_timer = ti->user_timer; - S390_lowcore.system_timer = ti->system_timer; + prev->thread.user_timer = S390_lowcore.user_timer; + prev->thread.system_timer = S390_lowcore.system_timer; + S390_lowcore.user_timer = current->thread.user_timer; + S390_lowcore.system_timer = current->thread.system_timer; } /* @@ -181,7 +176,6 @@ void vtime_account_user(struct task_struct *tsk) */ void vtime_account_irq_enter(struct task_struct *tsk) { - struct thread_info *ti = task_thread_info(tsk); u64 timer, system, system_scaled; timer = S390_lowcore.last_update_timer; @@ -193,9 +187,9 @@ void vtime_account_irq_enter(struct task_struct *tsk) time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); - system = S390_lowcore.system_timer - ti->system_timer; + system = S390_lowcore.system_timer - tsk->thread.system_timer; S390_lowcore.steal_timer -= system; - ti->system_timer = S390_lowcore.system_timer; + tsk->thread.system_timer = S390_lowcore.system_timer; system_scaled = system; /* Do MT utilization scaling */ if (smp_cpu_mtid) { diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index be4db07f70d3..af13f1a135b6 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -415,7 +415,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, int rc; mci.val = mchk->mcic; - /* take care of lazy register loading via vcpu load/put */ + /* take care of lazy register loading */ save_fpu_regs(); save_access_regs(vcpu->run->s.regs.acrs); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9c7a1ecfe6bd..bec71e902be3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1812,22 +1812,7 @@ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - /* Save host register state */ - save_fpu_regs(); - vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; - vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; - - if (MACHINE_HAS_VX) - current->thread.fpu.regs = vcpu->run->s.regs.vrs; - else - current->thread.fpu.regs = vcpu->run->s.regs.fprs; - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - if (test_fp_ctl(current->thread.fpu.fpc)) - /* User space provided an invalid FPC, let's clear it */ - current->thread.fpu.fpc = 0; - save_access_regs(vcpu->arch.host_acrs); - restore_access_regs(vcpu->run->s.regs.acrs); gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) @@ -1844,16 +1829,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.enabled_gmap = gmap_get_enabled(); gmap_disable(vcpu->arch.enabled_gmap); - /* Save guest register state */ - save_fpu_regs(); - vcpu->run->s.regs.fpc = current->thread.fpu.fpc; - - /* Restore host register state */ - current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; - current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - - save_access_regs(vcpu->run->s.regs.acrs); - restore_access_regs(vcpu->arch.host_acrs); } static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) @@ -2243,7 +2218,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs)); memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); - restore_access_regs(vcpu->run->s.regs.acrs); return 0; } @@ -2257,11 +2231,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - /* make sure the new values will be lazily loaded */ - save_fpu_regs(); if (test_fp_ctl(fpu->fpc)) return -EINVAL; - current->thread.fpu.fpc = fpu->fpc; + vcpu->run->s.regs.fpc = fpu->fpc; if (MACHINE_HAS_VX) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); @@ -2279,7 +2251,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) (__vector128 *) vcpu->run->s.regs.vrs); else memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs)); - fpu->fpc = current->thread.fpu.fpc; + fpu->fpc = vcpu->run->s.regs.fpc; return 0; } @@ -2740,6 +2712,20 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (riccb->valid) vcpu->arch.sie_block->ecb3 |= 0x01; } + save_access_regs(vcpu->arch.host_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); + /* save host (userspace) fprs/vrs */ + save_fpu_regs(); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + if (MACHINE_HAS_VX) + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + else + current->thread.fpu.regs = vcpu->run->s.regs.fprs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + if (test_fp_ctl(current->thread.fpu.fpc)) + /* User space provided an invalid FPC, let's clear it */ + current->thread.fpu.fpc = 0; kvm_run->kvm_dirty_regs = 0; } @@ -2758,6 +2744,15 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; + save_access_regs(vcpu->run->s.regs.acrs); + restore_access_regs(vcpu->arch.host_acrs); + /* Save guest register state */ + save_fpu_regs(); + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; + /* Restore will be done lazily at return */ + current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; + current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; + } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -2874,7 +2869,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) { /* * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy - * copying in vcpu load/put. Lets update our copies before we save + * switch in the run ioctl. Let's update our copies before we save * it into the save area */ save_fpu_regs(); diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index be9fa65bfac4..7422a706f310 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -8,6 +8,45 @@ #include <asm/export.h> /* + * void *memmove(void *dest, const void *src, size_t n) + */ +ENTRY(memmove) + ltgr %r4,%r4 + lgr %r1,%r2 + bzr %r14 + clgr %r2,%r3 + jnh .Lmemmove_forward + la %r5,0(%r4,%r3) + clgr %r2,%r5 + jl .Lmemmove_reverse +.Lmemmove_forward: + aghi %r4,-1 + srlg %r0,%r4,8 + ltgr %r0,%r0 + jz .Lmemmove_rest +.Lmemmove_loop: + mvc 0(256,%r1),0(%r3) + la %r1,256(%r1) + la %r3,256(%r3) + brctg %r0,.Lmemmove_loop +.Lmemmove_rest: + larl %r5,.Lmemmove_mvc + ex %r4,0(%r5) + br %r14 +.Lmemmove_reverse: + aghi %r4,-1 +.Lmemmove_reverse_loop: + ic %r0,0(%r4,%r3) + stc %r0,0(%r4,%r1) + brctg %r4,.Lmemmove_reverse_loop + ic %r0,0(%r4,%r3) + stc %r0,0(%r4,%r1) + br %r14 +.Lmemmove_mvc: + mvc 0(1,%r1),0(%r3) +EXPORT_SYMBOL(memmove) + +/* * memset implementation * * This code corresponds to the C construct below. We do distinguish diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 661d9fe63c43..d1faae5cdd12 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -733,6 +733,7 @@ block: * return to userspace schedule() to block. */ __set_current_state(TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); + set_preempt_need_resched(); } } out: diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 1848292766ef..45becc8a44ec 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -34,7 +34,7 @@ static void __ref *vmem_alloc_pages(unsigned int order) if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return alloc_bootmem_align(size, size); + return (void *) memblock_alloc(size, size); } static inline pud_t *vmem_pud_alloc(void) @@ -61,17 +61,16 @@ pmd_t *vmem_pmd_alloc(void) pte_t __ref *vmem_pte_alloc(void) { + unsigned long size = PTRS_PER_PTE * sizeof(pte_t); pte_t *pte; if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = alloc_bootmem_align(PTRS_PER_PTE * sizeof(pte_t), - PTRS_PER_PTE * sizeof(pte_t)); + pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; - clear_table((unsigned long *) pte, _PAGE_INVALID, - PTRS_PER_PTE * sizeof(pte_t)); + clear_table((unsigned long *) pte, _PAGE_INVALID, size); return pte; } diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c index 37e0bb835516..cfd08384f0ab 100644 --- a/arch/s390/numa/mode_emu.c +++ b/arch/s390/numa/mode_emu.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/cpumask.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <linux/node.h> #include <linux/memory.h> #include <linux/slab.h> @@ -307,13 +308,11 @@ fail: /* * Allocate and initialize core to node mapping */ -static void create_core_to_node_map(void) +static void __ref create_core_to_node_map(void) { int i; - emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL); - if (emu_cores == NULL) - panic("Could not allocate cores to node memory"); + emu_cores = memblock_virt_alloc(sizeof(*emu_cores), 8); for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) emu_cores->to_node_id[i] = NODE_ID_FREE; } @@ -354,13 +353,13 @@ static struct toptree *toptree_from_topology(void) phys = toptree_new(TOPTREE_ID_PHYS, 1); - for_each_online_cpu(cpu) { - top = &per_cpu(cpu_topology, cpu); + for_each_cpu(cpu, &cpus_with_topology) { + top = &cpu_topology[cpu]; node = toptree_get_child(phys, 0); drawer = toptree_get_child(node, top->drawer_id); book = toptree_get_child(drawer, top->book_id); mc = toptree_get_child(book, top->socket_id); - core = toptree_get_child(mc, top->core_id); + core = toptree_get_child(mc, smp_get_base_cpu(cpu)); if (!drawer || !book || !mc || !core) panic("NUMA emulation could not allocate memory"); cpumask_set_cpu(cpu, &core->mask); @@ -378,7 +377,7 @@ static void topology_add_core(struct toptree *core) int cpu; for_each_cpu(cpu, &core->mask) { - top = &per_cpu(cpu_topology, cpu); + top = &cpu_topology[cpu]; cpumask_copy(&top->thread_mask, &core->mask); cpumask_copy(&top->core_mask, &core_mc(core)->mask); cpumask_copy(&top->book_mask, &core_book(core)->mask); @@ -425,6 +424,27 @@ static void print_node_to_core_map(void) } } +static void pin_all_possible_cpus(void) +{ + int core_id, node_id, cpu; + static int initialized; + + if (initialized) + return; + print_node_to_core_map(); + node_id = 0; + for_each_possible_cpu(cpu) { + core_id = smp_get_base_cpu(cpu); + if (emu_cores->to_node_id[core_id] != NODE_ID_FREE) + continue; + pin_core_to_node(core_id, node_id); + cpu_topology[cpu].node_id = node_id; + node_id = (node_id + 1) % emu_nodes; + } + print_node_to_core_map(); + initialized = 1; +} + /* * Transfer physical topology into a NUMA topology and modify CPU masks * according to the NUMA topology. @@ -442,7 +462,7 @@ static void emu_update_cpu_topology(void) toptree_free(phys); toptree_to_topology(numa); toptree_free(numa); - print_node_to_core_map(); + pin_all_possible_cpus(); } /* diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c index 902d350d859a..26f622b1cd11 100644 --- a/arch/s390/numa/toptree.c +++ b/arch/s390/numa/toptree.c @@ -7,6 +7,7 @@ */ #include <linux/kernel.h> +#include <linux/bootmem.h> #include <linux/cpumask.h> #include <linux/list.h> #include <linux/list_sort.h> @@ -25,10 +26,14 @@ * RETURNS: * Pointer to the new tree node or NULL on error */ -struct toptree *toptree_alloc(int level, int id) +struct toptree __ref *toptree_alloc(int level, int id) { - struct toptree *res = kzalloc(sizeof(struct toptree), GFP_KERNEL); + struct toptree *res; + if (slab_is_available()) + res = kzalloc(sizeof(*res), GFP_KERNEL); + else + res = memblock_virt_alloc(sizeof(*res), 8); if (!res) return res; @@ -65,7 +70,7 @@ static void toptree_remove(struct toptree *cand) * cleanly using toptree_remove. Possible children are freed * recursively. In the end @cand itself is freed. */ -void toptree_free(struct toptree *cand) +void __ref toptree_free(struct toptree *cand) { struct toptree *child, *tmp; @@ -73,7 +78,10 @@ void toptree_free(struct toptree *cand) toptree_remove(cand); toptree_for_each_child_safe(child, tmp, cand) toptree_free(child); - kfree(cand); + if (slab_is_available()) + kfree(cand); + else + memblock_free_early((unsigned long)cand, sizeof(*cand)); } /** diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 15ffc19c8c0c..64e1734bebb7 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -722,6 +722,11 @@ struct dev_pm_ops pcibios_pm_ops = { static int zpci_alloc_domain(struct zpci_dev *zdev) { + if (zpci_unique_uid) { + zdev->domain = (u16) zdev->uid; + return 0; + } + spin_lock(&zpci_domain_lock); zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); if (zdev->domain == ZPCI_NR_DEVICES) { @@ -735,6 +740,9 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) static void zpci_free_domain(struct zpci_dev *zdev) { + if (zpci_unique_uid) + return; + spin_lock(&zpci_domain_lock); clear_bit(zdev->domain, zpci_domain); spin_unlock(&zpci_domain_lock); diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 1a4512c8544a..e3ef63b36b5a 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -22,6 +22,8 @@ #include <asm/clp.h> #include <uapi/asm/clp.h> +bool zpci_unique_uid; + static inline void zpci_err_clp(unsigned int rsp, int rc) { struct { @@ -315,6 +317,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, goto out; } + zpci_unique_uid = rrb->response.uid_checking; WARN_ON_ONCE(rrb->response.entry_size != sizeof(struct clp_fh_list_entry)); diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 38993b156924..c2f786f0ea06 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -69,7 +69,7 @@ static void pci_sw_counter_show(struct seq_file *m) int i; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) - seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], + seq_printf(m, "%26s:\t%lu\n", pci_sw_names[i], atomic64_read(counter)); } diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 6b2f72f523b9..1d7a9c71944a 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -181,14 +181,17 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, /* * With zdev->tlb_refresh == 0, rpcit is not required to establish new * translations when previously invalid translation-table entries are - * validated. With lazy unmap, it also is skipped for previously valid + * validated. With lazy unmap, rpcit is skipped for previously valid * entries, but a global rpcit is then required before any address can * be re-used, i.e. after each iommu bitmap wrap-around. */ - if (!zdev->tlb_refresh && - (!s390_iommu_strict || - ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))) - return 0; + if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { + if (!zdev->tlb_refresh) + return 0; + } else { + if (!s390_iommu_strict) + return 0; + } return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, PAGE_ALIGN(size)); @@ -257,7 +260,7 @@ static dma_addr_t dma_alloc_address(struct device *dev, int size) spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); offset = __dma_alloc_iommu(dev, zdev->next_bit, size); if (offset == -1) { - if (!zdev->tlb_refresh && !s390_iommu_strict) { + if (!s390_iommu_strict) { /* global flush before DMA addresses are reused */ if (zpci_refresh_global(zdev)) goto out_error; @@ -292,7 +295,7 @@ static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) if (!zdev->iommu_bitmap) goto out; - if (zdev->tlb_refresh || s390_iommu_strict) + if (s390_iommu_strict) bitmap_clear(zdev->iommu_bitmap, offset, size); else bitmap_set(zdev->lazy_bitmap, offset, size); @@ -388,8 +391,6 @@ static void *s390_dma_alloc(struct device *dev, size_t size, return NULL; pa = page_to_phys(page); - memset((void *) pa, 0, size); - map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0); if (dma_mapping_error(dev, map)) { free_pages(pa, get_order(size)); @@ -419,6 +420,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, size_t size, dma_addr_t *handle, enum dma_data_direction dir) { + unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); dma_addr_t dma_addr_base, dma_addr; int flags = ZPCI_PTE_VALID; @@ -426,8 +428,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, unsigned long pa = 0; int ret; - size = PAGE_ALIGN(size); - dma_addr_base = dma_alloc_address(dev, size >> PAGE_SHIFT); + dma_addr_base = dma_alloc_address(dev, nr_pages); if (dma_addr_base == DMA_ERROR_CODE) return -ENOMEM; @@ -436,26 +437,27 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, flags |= ZPCI_TABLE_PROTECTED; for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { - pa = page_to_phys(sg_page(s)) + s->offset; - ret = __dma_update_trans(zdev, pa, dma_addr, s->length, flags); + pa = page_to_phys(sg_page(s)); + ret = __dma_update_trans(zdev, pa, dma_addr, + s->offset + s->length, flags); if (ret) goto unmap; - dma_addr += s->length; + dma_addr += s->offset + s->length; } ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); if (ret) goto unmap; *handle = dma_addr_base; - atomic64_add(size >> PAGE_SHIFT, &zdev->mapped_pages); + atomic64_add(nr_pages, &zdev->mapped_pages); return ret; unmap: dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, ZPCI_PTE_INVALID); - dma_free_address(dev, dma_addr_base, size >> PAGE_SHIFT); + dma_free_address(dev, dma_addr_base, nr_pages); zpci_err("map error:\n"); zpci_err_dma(ret, pa); return ret; @@ -564,7 +566,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) rc = -ENOMEM; goto free_dma_table; } - if (!zdev->tlb_refresh && !s390_iommu_strict) { + if (!s390_iommu_strict) { zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); if (!zdev->lazy_bitmap) { rc = -ENOMEM; diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 6d9814c9df2b..4b5e1e499527 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -9,7 +9,5 @@ define filechk_facilities.h $(obj)/gen_facilities endef -$(obj)/gen_facilities.o: $(srctree)/arch/s390/tools/gen_facilities.c - include/generated/facilities.h: $(obj)/gen_facilities FORCE $(call filechk,facilities.h) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index fe4e6c910dd7..8cc53b1e6d03 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -7,13 +7,83 @@ * */ -#define S390_GEN_FACILITIES_C - #include <strings.h> #include <string.h> #include <stdlib.h> #include <stdio.h> -#include <asm/facilities_src.h> + +struct facility_def { + char *name; + int *bits; +}; + +static struct facility_def facility_defs[] = { + { + /* + * FACILITIES_ALS contains the list of facilities that are + * required to run a kernel that is compiled e.g. with + * -march=<machine>. + */ + .name = "FACILITIES_ALS", + .bits = (int[]){ +#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES + 18, /* long displacement facility */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES + 7, /* stfle */ + 17, /* message security assist */ + 21, /* extended-immediate facility */ + 25, /* store clock fast */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + 27, /* mvcos */ + 32, /* compare and swap and store */ + 33, /* compare and swap and store 2 */ + 34, /* general extension facility */ + 35, /* execute extensions */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + 45, /* fast-BCR, etc. */ +#endif +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + 49, /* misc-instruction-extensions */ + 52, /* interlocked facility 2 */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES + 53, /* load-and-zero-rightmost-byte, etc. */ +#endif + -1 /* END */ + } + }, + { + .name = "FACILITIES_KVM", + .bits = (int[]){ + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ + 2, /* z/Arch mode active */ + 3, /* DAT-enhancement */ + 4, /* idte segment table */ + 5, /* idte region table */ + 6, /* ASN-and-LX reuse */ + 7, /* stfle */ + 8, /* enhanced-DAT 1 */ + 9, /* sense-running-status */ + 10, /* conditional sske */ + 13, /* ipte-range */ + 14, /* nonquiescing key-setting */ + 73, /* transactional execution */ + 75, /* access-exception-fetch/store indication */ + 76, /* msa extension 3 */ + 77, /* msa extension 4 */ + 78, /* enhanced-DAT 2 */ + -1 /* END */ + } + }, +}; static void print_facility_list(struct facility_def *def) { diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile index accc7ca722e1..252e9fee687f 100644 --- a/arch/sh/kernel/cpu/Makefile +++ b/arch/sh/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux/SuperH CPU-specifc backends. +# Makefile for the Linux/SuperH CPU-specific backends. # obj-$(CONFIG_CPU_SH2) = sh2/ diff --git a/arch/sh/kernel/cpu/irq/Makefile b/arch/sh/kernel/cpu/irq/Makefile index f0c7025a67d1..3f8e79402d7d 100644 --- a/arch/sh/kernel/cpu/irq/Makefile +++ b/arch/sh/kernel/cpu/irq/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux/SuperH CPU-specifc IRQ handlers. +# Makefile for the Linux/SuperH CPU-specific IRQ handlers. # obj-$(CONFIG_SUPERH32) += imask.o obj-$(CONFIG_CPU_SH5) += intc-sh5.o diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 34d9e15857c3..44163e8c3868 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 476b574de99e..ec23d8e1297c 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_E820_H #define _ASM_X86_E820_H -#ifdef CONFIG_EFI +/* + * E820_X_MAX is the maximum size of the extended E820 table. The extended + * table may contain up to 3 extra E820 entries per possible NUMA node, so we + * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128. + * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h. + */ #include <linux/numa.h> #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) -#else /* ! CONFIG_EFI */ -#define E820_X_MAX E820MAX -#endif + #include <uapi/asm/e820.h> + #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map *e820; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bdde80731f49..7892530cbacf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -191,6 +191,8 @@ enum { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -198,6 +200,13 @@ enum { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) + +#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ + PFERR_USER_MASK | \ + PFERR_WRITE_MASK | \ + PFERR_PRESENT_MASK) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -1062,6 +1071,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1124,7 +1134,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); @@ -1203,7 +1214,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); @@ -1358,7 +1369,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a002b07a7099..2b5b2d4b924e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,7 @@ #define VMX_H +#include <linux/bitops.h> #include <linux/types.h> #include <uapi/asm/vmx.h> @@ -60,6 +61,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 #define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 @@ -110,6 +112,36 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) +{ + return vmx_basic & GENMASK_ULL(30, 0); +} + +static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; +} + +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) +{ + return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; +} + +static inline int vmx_misc_cr3_count(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(24, 16)) >> 16; +} + +static inline int vmx_misc_max_msr(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(27, 25)) >> 25; +} + +static inline int vmx_misc_mseg_revid(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(63, 32)) >> 32; +} + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -399,10 +431,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -419,8 +452,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..14458658e988 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,8 @@ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 @@ -113,6 +115,8 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ @@ -129,6 +133,7 @@ { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e42..be6337156502 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 132e1ec67da0..00ef43233e03 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -516,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_gen_pool_empty() && keventd_up()) + if (!mce_gen_pool_empty()) schedule_work(&mce_work); } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0aefb626fa8f..b2d3cf1ef54a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include "cpuid.h" @@ -64,6 +65,11 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) +/* These are scattered features in cpufeatures.h. */ +#define KVM_CPUID_BIT_AVX512_4VNNIW 2 +#define KVM_CPUID_BIT_AVX512_4FMAPS 3 +#define KF(x) bit(KVM_CPUID_BIT_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -80,6 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= F(OSXSAVE); } + best->edx &= ~F(APIC); + if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) + best->edx |= F(APIC); + if (apic) { if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -374,6 +384,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -456,12 +470,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); } else { entry->ebx = 0; entry->ecx = 0; + entry->edx = 0; } entry->eax = 0; - entry->edx = 0; break; } case 9: @@ -861,17 +877,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } EXPORT_SYMBOL_GPL(kvm_cpuid); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - u32 function, eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx; - function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); kvm_register_write(vcpu, VCPU_REGS_RAX, eax); kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3ce9d260d68..56628a44668b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -158,9 +158,11 @@ #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ -#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ -#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ +#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ @@ -446,6 +448,26 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; +/* + * XXX: inoutclob user must know where the argument is being expanded. + * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault. + */ +#define asm_safe(insn, inoutclob...) \ +({ \ + int _fault = 0; \ + \ + asm volatile("1:" insn "\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: movl $1, %[_fault]\n" \ + " jmp 2b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [_fault] "+qm"(_fault) inoutclob ); \ + \ + _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ +}) + static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -632,21 +654,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is - * subject to the same check. + * subject to the same check. FXSAVE and FXRSTOR are checked here too as their + * 512 bytes of data must be aligned to a 16 byte boundary. */ -static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (likely(size < 16)) - return false; + u64 alignment = ctxt->d & AlignMask; - if (ctxt->d & Aligned) - return true; - else if (ctxt->d & Unaligned) - return false; - else if (ctxt->d & Avx) - return false; - else - return true; + if (likely(size < 16)) + return 1; + + switch (alignment) { + case Unaligned: + case Avx: + return 1; + case Aligned16: + return 16; + case Aligned: + default: + return size; + } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, @@ -704,7 +731,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: @@ -3842,6 +3869,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_fxsr(struct x86_emulate_ctxt *ctxt) +{ + u32 eax = 1, ebx, ecx = 0, edx; + + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(edx & FFL(FXSR))) + return emulate_ud(ctxt); + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + /* + * Don't emulate a case that should never be hit, instead of working + * around a lack of fxsave64/fxrstor64 on old compilers. + */ + if (ctxt->mode >= X86EMUL_MODE_PROT64) + return X86EMUL_UNHANDLEABLE; + + return X86EMUL_CONTINUE; +} + +/* + * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, + * 1) 16 bit mode + * 2) 32 bit mode + * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs + * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. + * save and restore + * 3) 64-bit mode with REX.W prefix + * - like (2), but XMM 8-15 are being saved and restored + * 4) 64-bit mode without REX.W prefix + * - like (3), but FIP and FDP are 64 bit + * + * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the + * desired result. (4) is not emulated. + * + * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS + * and FPU DS) should match. + */ +static int em_fxsave(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + size_t size; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->ops->get_fpu(ctxt); + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) + size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); + else + size = offsetof(struct fxregs_state, xmm_space[0]); + + return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); +} + +static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, + struct fxregs_state *new) +{ + int rc = X86EMUL_CONTINUE; + struct fxregs_state old; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* + * 64 bit host will restore XMM 8-15, which is not correct on non-64 + * bit guests. Load the current values in order to preserve 64 bit + * XMMs after fxrstor. + */ +#ifdef CONFIG_X86_64 + /* XXX: accessing XMM 8-15 very awkwardly */ + memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); +#endif + + /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but + * does save and restore MXCSR. + */ + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) + memcpy(new->xmm_space, old.xmm_space, 8 * 16); + + return rc; +} + +static int em_fxrstor(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (fx_state.mxcsr >> 16) + return emulate_gp(ctxt, 0); + + ctxt->ops->get_fpu(ctxt); + + if (ctxt->mode < X86EMUL_MODE_PROT64) + rc = fxrstor_fixup(ctxt, &fx_state); + + if (rc == X86EMUL_CONTINUE) + rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + return rc; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4194,7 +4346,9 @@ static const struct gprefix pfx_0f_ae_7 = { }; static const struct group_dual group15 = { { - N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), + I(ModRM | Aligned16, em_fxsave), + I(ModRM | Aligned16, em_fxrstor), + N, N, N, N, N, GP(0, &pfx_0f_ae_7), }, { N, N, N, N, N, N, N, N, } }; @@ -5066,21 +5220,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { - bool fault = false; + int rc; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + rc = asm_safe("fwait"); ctxt->ops->put_fpu(ctxt); - if (unlikely(fault)) + if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 42b1c83741c8..99cde5220e07 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -291,7 +291,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) return ret; } -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) +static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); struct kvm_lapic_irq irq; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 16a7134eedac..a78b445ce411 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); + kthread_queue_work(pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - kthread_queue_work(&pt->worker, &pt->expired); + kthread_queue_work(pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) + pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker)) goto fail_kthread; kthread_init_work(&pit->expired, pit_do_work); @@ -713,7 +711,7 @@ fail_register_speaker: fail_register_pit: mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); fail_kthread: kvm_free_irq_source_id(kvm, pit->irq_source_id); fail_request: @@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 2f5af0798326..600bee9dcbbd 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,8 +44,7 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; + struct kthread_worker *worker; struct kthread_work expired; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f69340f9fa3..34a66b2d47e6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -342,9 +342,11 @@ void __kvm_apic_update_irr(u32 *pir, void *regs) u32 i, pir_val; for (i = 0; i <= 7; i++) { - pir_val = xchg(&pir[i], 0); - if (pir_val) + pir_val = READ_ONCE(pir[i]); + if (pir_val) { + pir_val = xchg(&pir[i], 0); *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + } } } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); @@ -1090,7 +1092,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - ktime_t remaining; + ktime_t remaining, now; s64 ns; u32 tmcct; @@ -1101,7 +1103,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) apic->lapic_timer.period == 0) return 0; - remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); @@ -1332,7 +1335,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_save(flags); - now = apic->lapic_timer.timer.base->get_time(); + now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; @@ -1347,6 +1350,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static void start_sw_period(struct kvm_lapic *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (apic_lvtt_oneshot(apic) && + ktime_after(ktime_get(), + apic->lapic_timer.target_expiration)) { + apic_timer_expired(apic); + return; + } + + hrtimer_start(&apic->lapic_timer.timer, + apic->lapic_timer.target_expiration, + HRTIMER_MODE_ABS_PINNED); +} + +static bool set_target_expiration(struct kvm_lapic *apic) +{ + ktime_t now; + u64 tscl = rdtsc(); + + now = ktime_get(); + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return false; + + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __func__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + kvm_lapic_get_reg(apic, APIC_TMICT), + apic->lapic_timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->lapic_timer.period))); + + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + + return true; +} + +static void advance_periodic_target_expiration(struct kvm_lapic *apic) +{ + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = + ktime_add_ns(apic->lapic_timer.target_expiration, + apic->lapic_timer.period); +} + bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) @@ -1356,52 +1432,59 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +static void cancel_hv_timer(struct kvm_lapic *apic) { kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) +static bool start_hv_timer(struct kvm_lapic *apic) { u64 tscdeadline = apic->lapic_timer.tscdeadline; - if (atomic_read(&apic->lapic_timer.pending) || + if ((atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) || kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); } else { apic->lapic_timer.hv_timer_in_use = true; hrtimer_cancel(&apic->lapic_timer.timer); /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); + if (atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) + cancel_hv_timer(apic); } trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, apic->lapic_timer.hv_timer_in_use); return apic->lapic_timer.hv_timer_in_use; } +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_timer(apic); + apic_timer_expired(apic); + + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + advance_periodic_target_expiration(apic); + if (!start_hv_timer(apic)) + start_sw_period(apic); + } +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); + start_hv_timer(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1413,62 +1496,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); if (atomic_read(&apic->lapic_timer.pending)) return; - start_sw_tscdeadline(apic); + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now; - atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or periodic mode */ - now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; - - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } - - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS_PINNED); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); + if (set_target_expiration(apic) && + !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) + start_sw_period(apic); } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) start_sw_tscdeadline(apic); } } @@ -1701,13 +1750,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!lapic_in_kernel(vcpu)) + return 0; + + return apic->lapic_timer.tscdeadline; +} u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!lapic_in_kernel(vcpu) || + !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1748,14 +1806,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) { + if (!apic) value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } vcpu->arch.apic_base = value; + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) + kvm_update_cpuid(vcpu); + + if (!apic) + return; + /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { @@ -1909,6 +1970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) apic_timer_expired(apic); if (lapic_is_periodic(apic)) { + advance_periodic_target_expiration(apic); hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1993,6 +2055,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) apic->lapic_timer.tscdeadline = 0; + if (apic_lvtt_oneshot(apic)) { + apic->lapic_timer.tscdeadline = 0; + apic->lapic_timer.target_expiration = ktime_set(0, 0); + } atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c29d51..e0c80233b3e1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -15,6 +15,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + ktime_t target_expiration; u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; @@ -85,6 +86,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 87c5880ba3b7..7012de4a1fed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1660,17 +1660,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) { - /* - * We are holding the kvm->mmu_lock, and we are blowing up - * shadow PTEs. MMU notifier consumers need to be kept at bay. - * This is correct as long as we don't decouple the mmu_lock - * protected regions (like invalidate_range_start|end does). - */ - kvm->mmu_notifier_seq++; + if (!shadow_accessed_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - } return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -4509,7 +4501,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; @@ -4528,12 +4520,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); if (r < 0) return r; if (!r) return 1; + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. + * + * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used + * in PFERR_NEXT_GUEST_PAGE) + */ + if (error_code == PFERR_NESTED_GUEST_PAGE) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + return 1; + } + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: @@ -4967,7 +4975,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) * zap all shadow pages. */ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { - printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca5038d..08a4d3ab3455 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; + u64 error_code; int r = 1; switch (svm->apf_reason) { @@ -2270,7 +2270,7 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; @@ -2278,7 +2278,8 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_fast_pio_out(vcpu, size, port); + return in ? kvm_fast_pio_in(vcpu, size, port) + : kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -3150,8 +3151,7 @@ static int skinit_interception(struct vcpu_svm *svm) static int wbinvd_interception(struct vcpu_svm *svm) { - kvm_emulate_wbinvd(&svm->vcpu); - return 1; + return kvm_emulate_wbinvd(&svm->vcpu); } static int xsetbv_interception(struct vcpu_svm *svm) @@ -3238,8 +3238,7 @@ static int task_switch_interception(struct vcpu_svm *svm) static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; + return kvm_emulate_cpuid(&svm->vcpu); } static int iret_interception(struct vcpu_svm *svm) @@ -3275,9 +3274,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, @@ -3374,9 +3371,7 @@ static int cr_interception(struct vcpu_svm *svm) } kvm_register_write(&svm->vcpu, reg, val); } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static int dr_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3980da515fd0..aae43c6f2472 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -132,6 +132,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -446,23 +462,31 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; u32 nested_vmx_secondary_ctls_low; u32 nested_vmx_secondary_ctls_high; u32 nested_vmx_pinbased_ctls_low; u32 nested_vmx_pinbased_ctls_high; u32 nested_vmx_exit_ctls_low; u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; u32 nested_vmx_entry_ctls_low; u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; u32 nested_vmx_vpid_caps; + u64 nested_vmx_basic; + u64 nested_vmx_cr0_fixed0; + u64 nested_vmx_cr0_fixed1; + u64 nested_vmx_cr4_fixed0; + u64 nested_vmx_cr4_fixed1; + u64 nested_vmx_vmcs_enum; }; #define POSTED_INTR_ON 0 @@ -520,6 +544,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, @@ -920,16 +950,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +enum { + VMX_IO_BITMAP_A, + VMX_IO_BITMAP_B, + VMX_MSR_BITMAP_LEGACY, + VMX_MSR_BITMAP_LONGMODE, + VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, + VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, + VMX_MSR_BITMAP_LEGACY_X2APIC, + VMX_MSR_BITMAP_LONGMODE_X2APIC, + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; + +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) +#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) +#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) +#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) +#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) +#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) +#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) +#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -2523,14 +2569,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; } else { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; } } else { if (is_long_mode(vcpu)) @@ -2706,9 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_true_exit_ctls_low = - vmx->nested.nested_vmx_exit_ctls_low & - ~VM_EXIT_SAVE_DEBUG_CONTROLS; + vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2727,9 +2771,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - vmx->nested.nested_vmx_true_entry_ctls_low = - vmx->nested.nested_vmx_entry_ctls_low & - ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2762,8 +2804,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - vmx->nested.nested_vmx_true_procbased_ctls_low = - vmx->nested.nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ @@ -2774,6 +2815,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -2805,8 +2847,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + VMX_VPID_EXTENT_SUPPORTED_MASK; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -2823,14 +2864,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; vmx->nested.nested_vmx_misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + vmx->nested.nested_vmx_basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; + vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + vmx->nested.nested_vmx_vmcs_enum = 0x2e; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { - /* - * Bits 0 in high must be 0, and bits 1 in low must be 1. - */ - return ((control & high) | low) == control; + return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) @@ -2838,87 +2917,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.nested_vmx_basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.nested_vmx_basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; + highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.nested_vmx_procbased_ctls_low; + highp = &vmx->nested.nested_vmx_procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.nested_vmx_exit_ctls_low; + highp = &vmx->nested.nested_vmx_exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.nested_vmx_entry_ctls_low; + highp = &vmx->nested.nested_vmx_entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.nested_vmx_secondary_ctls_low; + highp = &vmx->nested.nested_vmx_secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.nested_vmx_pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.nested_vmx_misc_low = data; + vmx->nested.nested_vmx_misc_high = data >> 32; + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, + vmx->nested.nested_vmx_vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.nested_vmx_ept_caps = data; + vmx->nested.nested_vmx_vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.nested_vmx_cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.nested_vmx_cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); switch (msr_index) { case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". + */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.nested_vmx_vmcs_enum = data; + return 0; + default: /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. + * The rest of the VMX capability MSRs do not support restore. */ - *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - if (cpu_has_vmx_basic_inout()) - *pdata |= VMX_BASIC_INOUT; + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = vmx->nested.nested_vmx_basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( vmx->nested.nested_vmx_misc_low, vmx->nested.nested_vmx_misc_high); break; - /* - * These MSRs specify bits which the guest must keep fixed (on or off) - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE case MSR_IA32_VMX_CR0_FIXED0: - *pdata = VMXON_CR0_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = VMXON_CR4_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + *pdata = vmx->nested.nested_vmx_vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( @@ -3101,7 +3378,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_leave_nested(vcpu); break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3863,6 +4144,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -3991,8 +4306,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!nested_vmx_allowed(vcpu)) return 1; } - if (to_vmx(vcpu)->nested.vmxon && - ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -4569,41 +4884,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4659,48 +4939,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) { if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); + msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_W); + msr, type); } } @@ -4822,9 +5072,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!pi_test_and_clear_on(&vmx->pi_desc)) + if (!pi_test_on(&vmx->pi_desc)) return; + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); } @@ -5583,7 +5839,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string; + int size, in, string, ret; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -5597,9 +5853,14 @@ static int handle_io(struct kvm_vcpu *vcpu) port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); - return kvm_fast_pio_out(vcpu, size, port); + ret = kvm_skip_emulated_instruction(vcpu); + + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_fast_pio_out(vcpu, size, port) && ret; } static void @@ -5613,18 +5874,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - always_on &= ~(X86_CR0_PE | X86_CR0_PG); - return (val & always_on) == always_on; -} - /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -5643,7 +5892,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vcpu, val)) + if (!nested_guest_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5652,8 +5901,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) return 0; } else { if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + !nested_host_cr0_valid(vcpu, val)) return 1; + return kvm_set_cr0(vcpu, val); } } @@ -5697,6 +5947,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int cr; int reg; int err; + int ret; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -5708,25 +5959,27 @@ static int handle_cr(struct kvm_vcpu *vcpu) switch (cr) { case 0: err = handle_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 3: err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); + ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) - return 1; + return ret; if (cr8_prev <= cr8) - return 1; + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } @@ -5735,23 +5988,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 2: /* clts */ handle_clts(vcpu); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - skip_emulated_instruction(vcpu); vmx_fpu_activate(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ @@ -5759,8 +6009,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); default: break; } @@ -5831,8 +6080,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) @@ -5864,8 +6112,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) static int handle_cpuid(struct kvm_vcpu *vcpu) { - kvm_emulate_cpuid(vcpu); - return 1; + return kvm_emulate_cpuid(vcpu); } static int handle_rdmsr(struct kvm_vcpu *vcpu) @@ -5886,8 +6133,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) @@ -5907,8 +6153,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) } trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) @@ -5952,8 +6197,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_rdpmc(struct kvm_vcpu *vcpu) @@ -5961,15 +6205,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) int err; err = kvm_rdpmc(vcpu); - kvm_complete_insn_gp(vcpu, err); - - return 1; + return kvm_complete_insn_gp(vcpu, err); } static int handle_wbinvd(struct kvm_vcpu *vcpu) { - kvm_emulate_wbinvd(vcpu); - return 1; + return kvm_emulate_wbinvd(vcpu); } static int handle_xsetbv(struct kvm_vcpu *vcpu) @@ -5978,20 +6219,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); return 1; } static int handle_xsaves(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } @@ -6012,8 +6253,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } } return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -6161,9 +6401,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - skip_emulated_instruction(vcpu); trace_kvm_fast_mmio(gpa); - return 1; + return kvm_skip_emulated_instruction(vcpu); } ret = handle_mmio_page_fault(vcpu, gpa, true); @@ -6348,50 +6587,13 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return r; + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) - goto out3; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out4; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out5; - - vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) - goto out6; - - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out7; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out8; - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6409,7 +6611,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out9; + goto out; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6472,39 +6674,34 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - memcpy(vmx_msr_bitmap_legacy_x2apic, + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + for (msr = 0x800; msr <= 0x8ff; msr++) { + if (msr == 0x839 /* TMCCT */) + continue; + vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); + } + /* - * enable_apicv && kvm_vcpu_apicv_active() + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. */ - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839, true); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b, true); + vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f, true); - - /* - * (enable_apicv && !kvm_vcpu_apicv_active()) || - * !enable_apicv - */ - /* TPR */ - vmx_disable_intercept_msr_read_x2apic(0x808, false); - vmx_disable_intercept_msr_write_x2apic(0x808, false); + vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6551,42 +6748,19 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out9: - free_page((unsigned long)vmx_vmwrite_bitmap); -out8: - free_page((unsigned long)vmx_vmread_bitmap); -out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); -out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); return r; } static __exit void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + int i; + + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); free_kvm_area(); } @@ -6600,16 +6774,13 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); - - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_nop(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_mwait(struct kvm_vcpu *vcpu) @@ -6916,8 +7087,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } page = nested_get_page(vcpu, vmptr); @@ -6925,8 +7095,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, *(u32 *)kmap(page) != VMCS12_REVISION) { nested_vmx_failInvalid(vcpu); kunmap(page); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } kunmap(page); vmx->nested.vmxon_ptr = vmptr; @@ -6935,30 +7104,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case EXIT_REASON_VMPTRLD: if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; default: @@ -7014,8 +7179,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) @@ -7055,9 +7219,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); @@ -7176,9 +7339,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(to_vmx(vcpu)); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -7217,9 +7379,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) nested_free_vmcs02(vmx, vmptr); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -7417,7 +7578,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == -1ull) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); return 0; } return 1; @@ -7431,17 +7591,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* * Now copy part of this value to register or memory, as requested. @@ -7461,8 +7622,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } @@ -7481,10 +7641,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); @@ -7504,19 +7666,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRLD instruction */ @@ -7537,8 +7696,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) page = nested_get_page(vcpu, vmptr); if (page == NULL) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); if (new_vmcs12->revision_id != VMCS12_REVISION) { @@ -7546,8 +7704,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_page_clean(page); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_release_vmcs12(vmx); @@ -7571,8 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRST instruction */ @@ -7597,8 +7753,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the INVEPT instruction */ @@ -7636,8 +7791,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* According to the Intel VMX instruction reference, the memory @@ -7668,8 +7822,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) break; } - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -7694,13 +7847,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + types = (vmx->nested.nested_vmx_vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* according to the intel vmx instruction reference, the memory @@ -7716,23 +7869,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: case VMX_VPID_EXTENT_SINGLE_CONTEXT: - /* - * Old versions of KVM use the single-context version so we - * have to support it; just treat it the same as all-context. - */ + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!vpid) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); - nested_vmx_succeed(vcpu); break; default: - /* Trap individual address invalidation invvpid calls */ - BUG_ON(1); - break; + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); } - skip_emulated_instruction(vcpu); - return 1; + __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + nested_vmx_succeed(vcpu); + + return kvm_skip_emulated_instruction(vcpu); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -8071,6 +8227,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); @@ -8620,11 +8778,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); register void *__sp asm(_ASM_SP); - /* - * If external interrupt exists, IF bit is set in rflags/eflags on the - * interrupt stack frame, and interrupt will be enabled on a return - * from interrupt handler. - */ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; @@ -8809,7 +8962,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; @@ -9279,6 +9432,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) (new_ctl & ~mask) | (cur_ctl & mask)); } +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; + vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ + cr4_fixed1_update(bit(11), ecx, bit(2)); + +#undef cr4_fixed1_update +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -9320,6 +9517,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + if (nested_vmx_allowed(vcpu)) + nested_vmx_cr_fixed1_bits_update(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9774,6 +9974,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + unsigned long *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } + + kvm_mmu_reset_context(vcpu); + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -9782,11 +10025,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. */ -static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + unsigned long *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -9951,6 +10198,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_intr_status); } + nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9964,6 +10212,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_constant_host_state(vmx); /* + * Set the MSR load/store lists to match L0's settings. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value * we wrote last (vmx->host_rsp). This cache is no longer relevant @@ -10069,15 +10326,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) nested_ept_init_mmu_context(vcpu); } - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - /* * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified * TS bit (for lazy fpu) and bits which we consider mandatory enabled. @@ -10092,8 +10340,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->guest_cr3); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + entry_failure_code)) + return 1; + kvm_mmu_reset_context(vcpu); if (!enable_ept) @@ -10111,6 +10371,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; } /* @@ -10125,12 +10386,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct loaded_vmcs *vmcs02; bool ia32e; u32 msr_entry_idx; + unsigned long exit_qualification; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; - skip_emulated_instruction(vcpu); + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + vmcs12 = get_vmcs12(vcpu); if (enable_shadow_vmcs) @@ -10150,37 +10413,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - return 1; + goto out; } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.nested_vmx_secondary_ctls_low, @@ -10189,33 +10452,34 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } - if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - return 1; + goto out; } - if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || - ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } if (vmcs12->vmcs_link_pointer != -1ull) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); - return 1; + goto out; } /* @@ -10235,7 +10499,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } } @@ -10253,7 +10517,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } } @@ -10266,6 +10530,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!vmcs02) return -ENOMEM; + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions. Don't call + * kvm_skip_emulated_instruction. + */ + skip_emulated_instruction(vcpu); enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -10280,7 +10550,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - prepare_vmcs02(vcpu, vmcs12); + if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qualification); + return 1; + } msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, @@ -10307,6 +10583,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 1; + +out: + return kvm_skip_emulated_instruction(vcpu); } /* @@ -10612,6 +10891,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + unsigned long entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -10649,8 +10929,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, nested_ept_uninit_mmu_context(vcpu); - kvm_set_cr3(vcpu, vmcs12->host_cr3); - kvm_mmu_reset_context(vcpu); + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; @@ -10751,6 +11035,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); @@ -10763,6 +11048,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (unlikely(vmx->fail)) + vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -10791,6 +11079,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -10839,7 +11129,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vm_inst_error); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2912684ff902..1f0d2383f5ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -434,12 +434,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) kvm_inject_gp(vcpu, 0); else - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + return 1; } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); @@ -573,7 +575,7 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -static bool pdptrs_changed(struct kvm_vcpu *vcpu) +bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; @@ -599,6 +601,7 @@ out: return changed; } +EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -2178,7 +2181,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; struct kvm_arch *ka = &vcpu->kvm->arch; kvmclock_reset(vcpu); @@ -2200,8 +2202,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - gpa_offset = data & ~(PAGE_MASK | 1); - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) @@ -2296,7 +2296,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } else { @@ -2508,7 +2508,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); + vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", + msr_info->index); return 1; } else { vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); @@ -2812,7 +2813,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (kvm_lapic_hv_timer_in_use(vcpu) && kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_get_lapic_target_expiration_tsc(vcpu))) kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update @@ -4832,7 +4833,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) +static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4852,8 +4853,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_emulate_wbinvd_noskip(vcpu); + kvm_emulate_wbinvd_noskip(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -5451,7 +5452,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never @@ -5464,6 +5464,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag } } +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + int r = EMULATE_DONE; + + kvm_x86_ops->skip_emulated_instruction(vcpu); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + return r == EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); + static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && @@ -5649,6 +5660,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); +static int complete_fast_pio_in(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + /* We should only ever be called with arch.pio.count equal to 1 */ + BUG_ON(vcpu->arch.pio.count != 1); + + /* For size less than 4 we merge, else we zero extend */ + val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) + : 0; + + /* + * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * the copy and tracing + */ + emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + + return 1; +} + +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) +{ + unsigned long val; + int ret; + + /* For size less than 4 we merge, else we zero extend */ + val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + + ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, + &val, 1); + if (ret) { + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + return ret; + } + + vcpu->arch.complete_userspace_io = complete_fast_pio_in; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio_in); + static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); @@ -5998,8 +6052,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_vcpu_halt(vcpu); + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_vcpu_halt(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -6030,9 +6088,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; + int op_64_bit, r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_skip_emulated_instruction(vcpu); if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index bedfab98077a..e1fb269c87af 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, - "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", + type == PCI_CAP_ID_MSI ? "" : "-X", irq); return irq; } diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index b27bccd4390f..821cb41f00e6 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -89,7 +89,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) } static void ce4100_serial_fixup(int port, struct uart_port *up, - unsigned short *capabilites) + u32 *capabilites) { #ifdef CONFIG_EARLY_PRINTK /* diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 0e98e5d241d0..a9fafb5c8738 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -19,7 +19,6 @@ int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..8c394e30e5fe 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820MAX] __initdata; +static struct e820entry xen_e820_map[E820_X_MAX] __initdata; static u32 xen_e820_map_entries __initdata; /* @@ -750,7 +750,7 @@ char * __init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? @@ -923,7 +923,7 @@ char * __init xen_auto_xlated_memory_setup(void) int i; int rc; - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); |