diff options
Diffstat (limited to 'arch/x86')
312 files changed, 11280 insertions, 10650 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore index aff152c87cf4..5a82bac5e0bc 100644 --- a/arch/x86/.gitignore +++ b/arch/x86/.gitignore @@ -1,6 +1,7 @@ boot/compressed/vmlinux tools/test_get_len tools/insn_sanity +tools/insn_decoder_test purgatory/kexec-purgatory.c purgatory/purgatory.ro diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 63bf349b2b24..c07f492b871a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,6 +29,7 @@ config X86_64 select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select X86_DEV_DMA_OPS + select ARCH_HAS_SYSCALL_WRAPPER # # Arch settings @@ -51,10 +52,10 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_REFCOUNT @@ -83,6 +84,7 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS + select DMA_DIRECT_OPS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS @@ -272,6 +274,9 @@ config ARCH_HAS_CPU_RELAX config ARCH_HAS_CACHE_LINE_SIZE def_bool y +config ARCH_HAS_FILTER_PGPROT + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y @@ -393,17 +398,6 @@ config X86_FEATURE_NAMES If in doubt, say Y. -config X86_FAST_FEATURE_TESTS - bool "Fast CPU feature tests" if EMBEDDED - default y - ---help--- - Some fast-paths in the kernel depend on the capabilities of the CPU. - Say Y here for the kernel to patch in the appropriate code at runtime - based on the capabilities of the CPU. The infrastructure for patching - code at runtime takes up some additional space; space-constrained - embedded systems may wish to say N here to produce smaller, slightly - slower code. 
- config X86_X2APIC bool "Support x2apic" depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) @@ -423,12 +417,6 @@ config X86_MPPARSE For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -config X86_BIGSMP - bool "Support for big SMP systems with more than 8 CPUs" - depends on X86_32 && SMP - ---help--- - This option is needed for the systems that have more than 8 CPUs - config GOLDFISH def_bool y depends on X86_GOLDFISH @@ -436,6 +424,7 @@ config GOLDFISH config RETPOLINE bool "Avoid speculative indirect branches in kernel" default y + select STACK_VALIDATION if HAVE_STACK_VALIDATION help Compile kernel with the retpoline compiler options to guard against kernel-to-user data leaks by avoiding speculative indirect @@ -460,6 +449,12 @@ config INTEL_RDT Say N if unsure. if X86_32 +config X86_BIGSMP + bool "Support for big SMP systems with more than 8 CPUs" + depends on SMP + ---help--- + This option is needed for the systems that have more than 8 CPUs + config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -690,6 +685,7 @@ config X86_SUPPORTS_MEMORY_FAILURE config STA2X11 bool "STA2X11 Companion Chip Support" depends on X86_32_NON_STANDARD && PCI + select ARCH_HAS_PHYS_TO_DMA select X86_DEV_DMA_OPS select X86_DMA_REMAP select SWIOTLB @@ -949,25 +945,66 @@ config MAXSMP Enable maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. +# +# The maximum number of CPUs supported: +# +# The main config value is NR_CPUS, which defaults to NR_CPUS_DEFAULT, +# and which can be configured interactively in the +# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range. +# +# The ranges are different on 32-bit and 64-bit kernels, depending on +# hardware capabilities and scalability features of the kernel. +# +# ( If MAXSMP is enabled we just use the highest possible value and disable +# interactive configuration. 
) +# + +config NR_CPUS_RANGE_BEGIN + int + default NR_CPUS_RANGE_END if MAXSMP + default 1 if !SMP + default 2 + +config NR_CPUS_RANGE_END + int + depends on X86_32 + default 64 if SMP && X86_BIGSMP + default 8 if SMP && !X86_BIGSMP + default 1 if !SMP + +config NR_CPUS_RANGE_END + int + depends on X86_64 + default 8192 if SMP && ( MAXSMP || CPUMASK_OFFSTACK) + default 512 if SMP && (!MAXSMP && !CPUMASK_OFFSTACK) + default 1 if !SMP + +config NR_CPUS_DEFAULT + int + depends on X86_32 + default 32 if X86_BIGSMP + default 8 if SMP + default 1 if !SMP + +config NR_CPUS_DEFAULT + int + depends on X86_64 + default 8192 if MAXSMP + default 64 if SMP + default 1 if !SMP + config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP - range 2 64 if SMP && X86_32 && X86_BIGSMP - range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 - range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 - default "1" if !SMP - default "8192" if MAXSMP - default "32" if SMP && X86_BIGSMP - default "8" if SMP && X86_32 - default "64" if SMP + range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END + default NR_CPUS_DEFAULT ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum supported value is 8192, otherwise the maximum value is 512. The minimum value which makes sense is 2. - This is purely to save memory - each supported CPU adds - approximately eight kilobytes to the kernel image. + This is purely to save memory: each supported CPU adds about 8KB + to the kernel image. config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -1264,7 +1301,7 @@ config MICROCODE the Linux kernel. The preferred method to load microcode from a detached initrd is described - in Documentation/x86/early-microcode.txt. For that you need to enable + in Documentation/x86/microcode.txt. 
For that you need to enable CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the initrd for microcode blobs. @@ -1363,7 +1400,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486 + depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 select X86_PAE ---help--- Select this if you have a 32-bit processor and more than 4 @@ -1430,6 +1467,8 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + select DYNAMIC_MEMORY_LAYOUT + select SPARSEMEM_VMEMMAP depends on X86_64 ---help--- 5-level paging enables access to larger address space: @@ -1438,8 +1477,8 @@ config X86_5LEVEL It will be supported by future Intel CPUs. - Note: a kernel with this option enabled can only be booted - on machines that support the feature. + A kernel with the option enabled can be booted on machines that + support 4- or 5-level paging. See Documentation/x86/x86_64/5level-paging.txt for more information. @@ -1564,10 +1603,6 @@ config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM -config NEED_NODE_MEMMAP_SIZE - def_bool y - depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA @@ -1978,6 +2013,9 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by previous system call. +config ARCH_HAS_KEXEC_PURGATORY + def_bool KEXEC_FILE + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE @@ -2143,10 +2181,17 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config DYNAMIC_MEMORY_LAYOUT + bool + ---help--- + This option makes base addresses of vmalloc and vmemmap as well as + __PAGE_OFFSET movable during boot. 
+ config RANDOMIZE_MEMORY bool "Randomize the kernel memory sections" depends on X86_64 depends on RANDOMIZE_BASE + select DYNAMIC_MEMORY_LAYOUT default RANDOMIZE_BASE ---help--- Randomizes the base virtual address of kernel memory sections @@ -2265,7 +2310,7 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[native|emulate|none]. + line parameter vsyscall=[emulate|none]. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty @@ -2273,15 +2318,6 @@ choice If unsure, select "Emulate". - config LEGACY_VSYSCALL_NATIVE - bool "Native" - help - Actual executable code is located in the fixed vsyscall - address mapping, implementing time() efficiently. Since - this makes the mapping executable, it can be used during - security vulnerability exploitation (traditionally as - ROP gadgets). This configuration is not recommended. 
- config LEGACY_VSYSCALL_EMULATE bool "Emulate" help @@ -2599,8 +2635,10 @@ config PCI_DIRECT depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG)) config PCI_MMCONFIG - def_bool y - depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY) + bool "Support mmconfig PCI config space access" if X86_64 + default y + depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST) + depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG) config PCI_OLPC def_bool y @@ -2615,9 +2653,9 @@ config PCI_DOMAINS def_bool y depends on PCI -config PCI_MMCONFIG - bool "Support mmconfig PCI config space access" - depends on X86_64 && PCI && ACPI +config MMCONF_FAM10H + def_bool y + depends on X86_64 && PCI_MMCONFIG && ACPI config PCI_CNB20LE_QUIRK bool "Read CNB20LE Host Bridge Windows" if EXPERT @@ -2635,11 +2673,13 @@ config PCI_CNB20LE_QUIRK source "drivers/pci/Kconfig" config ISA_BUS - bool "ISA-style bus support on modern systems" if EXPERT - select ISA_BUS_API + bool "ISA bus support on modern systems" if EXPERT help - Enables ISA-style drivers on modern systems. This is necessary to - support PC/104 devices on X86_64 platforms. + Expose ISA bus device drivers and options available for selection and + configuration. Enable this option if your target machine has an ISA + bus. ISA is an older system, displaced by PCI and newer bus + architectures -- if your target machine is modern, it probably does + not have an ISA bus. If unsure, say N. 
@@ -2728,11 +2768,9 @@ config OLPC_XO1_RTC config OLPC_XO1_SCI bool "OLPC XO-1 SCI extras" - depends on OLPC && OLPC_XO1_PM + depends on OLPC && OLPC_XO1_PM && GPIO_CS5535=y depends on INPUT=y select POWER_SUPPLY - select GPIO_CS5535 - select MFD_CORE ---help--- Add support for SCI-based features of the OLPC XO-1 laptop: - EC-driven system wakeups diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 65a9a4716e34..638411f22267 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -315,19 +315,6 @@ config X86_L1_CACHE_SHIFT default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -config X86_PPRO_FENCE - bool "PentiumPro memory ordering errata workaround" - depends on M686 || M586MMX || M586TSC || M586 || M486 || MGEODEGX1 - ---help--- - Old PentiumPro multiprocessor systems had errata that could cause - memory operations to violate the x86 ordering standard in rare cases. - Enabling this option will attempt to work around some (but not all) - occurrences of this problem, at the cost of much heavier spinlock and - memory barrier operations. - - If unsure, say n here. Even distro kernels should think twice before - enabling this: there are few systems, and an unlikely bug. - config X86_F00F_BUG def_bool y depends on M586MMX || M586TSC || M586 || M486 @@ -374,7 +361,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 # this should be set for all -march=.. options where the compiler # generates cmov. 
@@ -385,7 +372,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && X86_P6_NOP + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) default "5" if X86_32 && X86_CMPXCHG64 default "4" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index fad55160dcb9..60135cbd905c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -31,8 +31,7 @@ endif CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS)) -REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ - -DDISABLE_BRANCH_PROFILING \ +REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse @@ -181,6 +180,10 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER endif endif +ifndef CC_HAVE_ASM_GOTO + $(error Compiler lacks asm-goto support.) +endif + # # Jump labels need '-maccumulate-outgoing-args' for gcc < 4.5.2 to prevent a # GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226). There's no way @@ -223,6 +226,15 @@ KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) LDFLAGS := -m elf_$(UTS_MACHINE) +# +# The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to +# the linker to force 2MB page size regardless of the default page size used +# by the linker. 
+# +ifdef CONFIG_X86_64 +LDFLAGS += $(call ld-option, -z max-page-size=0x200000) +endif + # Speed up the build KBUILD_CFLAGS += -pipe # Workaround for a gcc prelease that unfortunately was shipped in a suse release @@ -232,10 +244,9 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE - RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) - ifneq ($(RETPOLINE_CFLAGS),) - KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE - endif +ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE +endif endif archscripts: scripts_basic diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f25e1530e064..fa42f895fdde 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -26,7 +26,7 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2 +KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 @@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 353e20c3f114..47d3efff6805 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -421,9 +421,10 @@ static void retrieve_apple_device_properties(struct 
boot_params *boot_params) } } +static const efi_char16_t apple[] = L"Apple"; + static void setup_quirks(struct boot_params *boot_params) { - efi_char16_t const apple[] = { 'A', 'p', 'p', 'l', 'e', 0 }; efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) efi_table_attr(efi_system_table, fw_vendor, sys_table); @@ -439,7 +440,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) struct efi_uga_draw_protocol *uga = NULL, *first_uga; efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; - u32 *handles = (u32 *)uga_handle;; + u32 *handles = (u32 *)uga_handle; efi_status_t status = EFI_INVALID_PARAMETER; int i; @@ -484,7 +485,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) struct efi_uga_draw_protocol *uga = NULL, *first_uga; efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; - u64 *handles = (u64 *)uga_handle;; + u64 *handles = (u64 *)uga_handle; efi_status_t status = EFI_INVALID_PARAMETER; int i; diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fc313e29fe2c..fca012baba19 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include "pgtable.h" /* * Locally defined symbols should be marked hidden: @@ -304,55 +305,77 @@ ENTRY(startup_64) /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp -#ifdef CONFIG_X86_5LEVEL /* - * Check if we need to enable 5-level paging. - * RSI holds real mode data and need to be preserved across - * a function call. + * At this point we are in long mode with 4-level paging enabled, + * but we might want to enable 5-level paging or vice versa. + * + * The problem is that we cannot do it directly. Setting or clearing + * CR4.LA57 in long mode would trigger #GP. So we need to switch off + * long mode and paging first. 
+ * + * We also need a trampoline in lower memory to switch over from + * 4- to 5-level paging for cases when the bootloader puts the kernel + * above 4G, but didn't enable 5-level paging for us. + * + * The same trampoline can be used to switch from 5- to 4-level paging + * mode, like when starting 4-level paging kernel via kexec() when + * original kernel worked in 5-level paging mode. + * + * For the trampoline, we need the top page table to reside in lower + * memory as we don't have a way to load 64-bit values into CR3 in + * 32-bit mode. + * + * We go though the trampoline even if we don't have to: if we're + * already in a desired paging mode. This way the trampoline code gets + * tested on every boot. */ - pushq %rsi - call l5_paging_required - popq %rsi - /* If l5_paging_required() returned zero, we're done here. */ - cmpq $0, %rax - je lvl5 + /* Make sure we have GDT with 32-bit code segment */ + leaq gdt(%rip), %rax + movq %rax, gdt64+2(%rip) + lgdt gdt64(%rip) /* - * At this point we are in long mode with 4-level paging enabled, - * but we want to enable 5-level paging. + * paging_prepare() sets up the trampoline and checks if we need to + * enable 5-level paging. * - * The problem is that we cannot do it directly. Setting LA57 in - * long mode would trigger #GP. So we need to switch off long mode - * first. + * Address of the trampoline is returned in RAX. + * Non zero RDX on return means we need to enable 5-level paging. * - * NOTE: This is not going to work if bootloader put us above 4G - * limit. - * - * The first step is go into compatibility mode. + * RSI holds real mode data and needs to be preserved across + * this function call. 
*/ + pushq %rsi + call paging_prepare + popq %rsi - /* Clear additional page table */ - leaq lvl5_pgtable(%rbx), %rdi - xorq %rax, %rax - movq $(PAGE_SIZE/8), %rcx - rep stosq + /* Save the trampoline address in RCX */ + movq %rax, %rcx /* - * Setup current CR3 as the first and only entry in a new top level - * page table. + * Load the address of trampoline_return() into RDI. + * It will be used by the trampoline to return to the main code. */ - movq %cr3, %rdi - leaq 0x7 (%rdi), %rax - movq %rax, lvl5_pgtable(%rbx) + leaq trampoline_return(%rip), %rdi /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS - leaq compatible_mode(%rip), %rax + leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax pushq %rax lretq -lvl5: -#endif +trampoline_return: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp + + /* + * cleanup_trampoline() would restore trampoline memory. + * + * RSI holds real mode data and needs to be preserved across + * this function call. + */ + pushq %rsi + call cleanup_trampoline + popq %rsi /* Zero EFLAGS */ pushq $0 @@ -490,46 +513,82 @@ relocated: jmp *%rax .code32 -#ifdef CONFIG_X86_5LEVEL -compatible_mode: - /* Setup data and stack segments */ +/* + * This is the 32-bit trampoline that will be copied over to low memory. + * + * RDI contains the return address (might be above 4G). + * ECX contains the base address of the trampoline memory. + * Non zero RDX on return means we need to enable 5-level paging. 
+ */ +ENTRY(trampoline_32bit_src) + /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %ss + /* Set up new stack */ + leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Point CR3 to 5-level paging */ - leal lvl5_pgtable(%ebx), %eax - movl %eax, %cr3 + /* Check what paging mode we want to be in after the trampoline */ + cmpl $0, %edx + jz 1f - /* Enable PAE and LA57 mode */ + /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ + movl %cr4, %eax + testl $X86_CR4_LA57, %eax + jnz 3f + jmp 2f +1: + /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ movl %cr4, %eax - orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + testl $X86_CR4_LA57, %eax + jz 3f +2: + /* Point CR3 to the trampoline's new top level page table */ + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax + movl %eax, %cr3 +3: + /* Enable PAE and LA57 (if required) paging modes */ + movl $X86_CR4_PAE, %eax + cmpl $0, %edx + jz 1f + orl $X86_CR4_LA57, %eax +1: movl %eax, %cr4 - /* Calculate address we are running at */ - call 1f -1: popl %edi - subl $1b, %edi + /* Calculate address of paging_enabled() once we are executing in the trampoline */ + leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - /* Prepare stack for far return to Long Mode */ + /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS - leal lvl5(%edi), %eax - push %eax + pushl %eax - /* Enable paging back */ + /* Enable paging again */ movl $(X86_CR0_PG | X86_CR0_PE), %eax movl %eax, %cr0 lret -#endif + .code64 +paging_enabled: + /* Return from the trampoline */ + jmp *%rdi + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. 
+ */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE + + .code32 no_longmode: - /* This isn't an x86-64 CPU so hang */ + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b @@ -537,6 +596,11 @@ no_longmode: #include "../../kernel/verify_cpu.S" .data +gdt64: + .word gdt_end - gdt + .long 0 + .word 0 + .quad 0 gdt: .word gdt_end - gdt .long gdt @@ -585,7 +649,3 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -#ifdef CONFIG_X86_5LEVEL -lvl5_pgtable: - .fill PAGE_SIZE, 1, 0 -#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8199a6187251..a0a50b91ecef 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -46,8 +46,17 @@ #define STATIC #include <linux/decompress/mm.h> +#ifdef CONFIG_X86_5LEVEL +unsigned int pgtable_l5_enabled __ro_after_init; +unsigned int pgdir_shift __ro_after_init = 39; +unsigned int ptrs_per_p4d __ro_after_init = 1; +#endif + extern unsigned long get_cmd_line_ptr(void); +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -723,6 +732,14 @@ void choose_random_location(unsigned long input, return; } +#ifdef CONFIG_X86_5LEVEL + if (__read_cr4() & X86_CR4_LA57) { + pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + } +#endif + boot_params->hdr.loadflags |= KASLR_FLAG; /* Prepare to add new identity pagetables on demand. 
*/ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/kaslr_64.c index b5e5e02f8cde..522d11431433 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -16,13 +16,6 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) -/* - * The pgtable.h and mm/ident_map.c includes make use of the SME related - * information which is not used in the compressed image support. Un-define - * the SME support to avoid any compile and link errors. - */ -#undef CONFIG_AMD_MEM_ENCRYPT - /* No PAGE_TABLE_ISOLATION support needed either: */ #undef CONFIG_PAGE_TABLE_ISOLATION @@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { - unsigned long sev_me_mask = get_sev_encryption_mask(); + /* If running as an SEV guest, the encryption mask is required. */ + set_sev_encryption_mask(); /* Init mapping_info with run-time function/buffer pointers. 
*/ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 54f5f6625a73..eaa843a52907 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit) ENDPROC(get_sev_encryption_bit) .code64 -ENTRY(get_sev_encryption_mask) - xor %rax, %rax - +ENTRY(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp push %rdx @@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask) testl %eax, %eax jz .Lno_sev_mask - xor %rdx, %rdx - bts %rax, %rdx /* Create the encryption mask */ - mov %rdx, %rax /* ... and return it */ + bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ .Lno_sev_mask: movq %rbp, %rsp /* Restore original stack pointer */ @@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask) pop %rbp #endif + xor %rax, %rax ret -ENDPROC(get_sev_encryption_mask) +ENDPROC(set_sev_encryption_mask) .data enc_bit: .int 0xffffffff + +#ifdef CONFIG_AMD_MEM_ENCRYPT + .balign 8 +GLOBAL(sme_me_mask) + .quad 0 +#endif diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 98761a1576ce..8dd1d5ccae58 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include "misc.h" #include "error.h" +#include "pgtable.h" #include "../string.h" #include "../voffset.h" @@ -169,16 +170,6 @@ void __puthex(unsigned long value) } } -static bool l5_supported(void) -{ - /* Check if leaf 7 is supported. */ - if (native_cpuid_eax(0) < 7) - return 0; - - /* Check if la57 is supported. 
*/ - return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -309,6 +300,10 @@ static void parse_elf(void *output) switch (phdr->p_type) { case PT_LOAD: +#ifdef CONFIG_X86_64 + if ((phdr->p_align % 0x200000) != 0) + error("Alignment of LOAD segment isn't multiple of 2MB"); +#endif #ifdef CONFIG_RELOCATABLE dest = output; dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR); @@ -372,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); - if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { - error("This linux kernel as configured requires 5-level paging\n" - "This CPU does not support the required 'cr4.la57' feature\n" - "Unable to boot - please use a kernel appropriate for your CPU\n"); - } - free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -388,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putaddr(output_len); debug_putaddr(kernel_total_size); +#ifdef CONFIG_X86_64 + /* Report address of 32-bit trampoline */ + debug_putaddr(trampoline_32bit); +#endif + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9d323dc6b159..9e11be4cae19 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -12,6 +12,11 @@ #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN +#ifdef CONFIG_X86_5LEVEL +/* cpu_feature_enabled() cannot be used that early */ +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> @@ -109,6 +114,6 @@ static inline void console_init(void) { } #endif -unsigned long get_sev_encryption_mask(void); 
+void set_sev_encryption_mask(void); #endif diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h new file mode 100644 index 000000000000..91f75638f6e6 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable.h @@ -0,0 +1,20 @@ +#ifndef BOOT_COMPRESSED_PAGETABLE_H +#define BOOT_COMPRESSED_PAGETABLE_H + +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0x60 + +#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE + +#ifndef __ASSEMBLER__ + +extern unsigned long *trampoline_32bit; + +extern void trampoline_32bit_src(void *return_ptr); + +#endif /* __ASSEMBLER__ */ +#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index b4469a37e9a1..32af1cbcd903 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,4 +1,6 @@ #include <asm/processor.h> +#include "pgtable.h" +#include "../string.h" /* * __force_order is used by special_insns.h asm code to force instruction @@ -9,20 +11,144 @@ */ unsigned long __force_order; -int l5_paging_required(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +struct paging_config { + unsigned long trampoline_start; + unsigned long l5_required; +}; + +/* Buffer to preserve trampoline memory */ +static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + +/* + * The page table is going to be used instead of page table in the trampoline + * memory. + * + * It must not be in BSS as BSS is cleared after cleanup_trampoline(). + */ +static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); + +/* + * Trampoline address will be printed by extract_kernel() for debugging + * purposes. 
+ * + * Avoid putting the pointer into .bss as it will be cleared between + * paging_prepare() and extract_kernel(). + */ +unsigned long *trampoline_32bit __section(.data); + +struct paging_config paging_prepare(void) { - /* Check if leaf 7 is supported. */ + struct paging_config paging_config = {}; + unsigned long bios_start, ebda_start; + + /* + * Check if LA57 is desired and supported. + * + * There are two parts to the check: + * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y + * - if the machine supports 5-level paging: + * + CPUID leaf 7 is supported + * + the leaf has the feature bit set + * + * That's substitute for boot_cpu_has() in early boot code. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL) && + native_cpuid_eax(0) >= 7 && + (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { + paging_config.l5_required = 1; + } + + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; - if (native_cpuid_eax(0) < 7) - return 0; + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + /* Place the trampoline just below the end of low memory, aligned to 4k */ + paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE; + paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE); + + trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + + /* Preserve trampoline memory */ + memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); + + /* Clear trampoline memory first */ + memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); + + /* Copy trampoline code in place */ + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + + /* + * The code below 
prepares page table in trampoline memory. + * + * The new page table will be used by trampoline code for switching + * from 4- to 5-level paging or vice versa. + * + * If switching is not required, the page table is unused: trampoline + * code wouldn't touch CR3. + */ + + /* + * We are not going to use the page table in trampoline memory if we + * are already in the desired paging mode. + */ + if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + goto out; + + if (paging_config.l5_required) { + /* + * For 4- to 5-level paging transition, set up current CR3 as + * the first and the only entry in a new top-level page table. + */ + trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + } else { + unsigned long src; + + /* + * For 5- to 4-level paging transition, copy page table pointed + * by first entry in the current top-level page table as our + * new top-level page table. + * + * We cannot just point to the page table from trampoline as it + * may be above 4G. + */ + src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), + (void *)src, PAGE_SIZE); + } + +out: + return paging_config; +} + +void cleanup_trampoline(void) +{ + void *trampoline_pgtable; - /* Check if la57 is supported. */ - if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) - return 0; + trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET; - /* Check if 5-level paging has already been enabled. */ - if (native_read_cr4() & X86_CR4_LA57) - return 0; + /* + * Move the top level page table out of trampoline memory, + * if it's there. 
+ */ + if ((void *)__native_read_cr3() == trampoline_pgtable) { + memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); + native_write_cr3((unsigned long)top_pgtable); + } - return 1; + /* Restore trampoline memory */ + memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); } diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 12e8484a8ee7..e762ef417562 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -94,23 +94,30 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define STACK_OFFSET 8*3 -#define HashKey 16*0 // store HashKey <<1 mod poly here -#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here -#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 + +#define AadHash 16*0 +#define AadLen 16*1 +#define InLen (16*1)+8 +#define PBlockEncKey 16*2 +#define OrigIV 16*3 +#define CurCount 16*4 +#define PBlockLen 16*5 +#define HashKey 16*6 // store HashKey <<1 mod poly here +#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here +#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here +#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here +#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 // bits of HashKey <<1 mod poly here //(for Karatsuba purposes) -#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 +#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 // bits of HashKey^2 <<1 mod poly here // (for Karatsuba purposes) -#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 +#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 // bits of HashKey^3 <<1 mod poly here // (for Karatsuba purposes) -#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 +#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 // bits of HashKey^4 <<1 mod poly here // (for 
Karatsuba purposes) -#define VARIABLE_OFFSET 16*8 #define arg1 rdi #define arg2 rsi @@ -118,10 +125,11 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define arg4 rcx #define arg5 r8 #define arg6 r9 -#define arg7 STACK_OFFSET+8(%r14) -#define arg8 STACK_OFFSET+16(%r14) -#define arg9 STACK_OFFSET+24(%r14) -#define arg10 STACK_OFFSET+32(%r14) +#define arg7 STACK_OFFSET+8(%rsp) +#define arg8 STACK_OFFSET+16(%rsp) +#define arg9 STACK_OFFSET+24(%rsp) +#define arg10 STACK_OFFSET+32(%rsp) +#define arg11 STACK_OFFSET+40(%rsp) #define keysize 2*15*16(%arg1) #endif @@ -171,6 +179,332 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define TKEYP T1 #endif +.macro FUNC_SAVE + push %r12 + push %r13 + push %r14 +# +# states of %xmm registers %xmm6:%xmm15 not saved +# all %xmm registers are clobbered +# +.endm + + +.macro FUNC_RESTORE + pop %r14 + pop %r13 + pop %r12 +.endm + +# Precompute hashkeys. +# Input: Hash subkey. +# Output: HashKeys stored in gcm_context_data. Only needs to be called +# once per key. +# clobbers r12, and tmp xmm registers. 
+.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 + mov \SUBKEY, %r12 + movdqu (%r12), \TMP3 + movdqa SHUF_MASK(%rip), \TMP2 + PSHUFB_XMM \TMP2, \TMP3 + + # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) + + movdqa \TMP3, \TMP2 + psllq $1, \TMP3 + psrlq $63, \TMP2 + movdqa \TMP2, \TMP1 + pslldq $8, \TMP2 + psrldq $8, \TMP1 + por \TMP2, \TMP3 + + # reduce HashKey<<1 + + pshufd $0x24, \TMP1, \TMP2 + pcmpeqd TWOONE(%rip), \TMP2 + pand POLY(%rip), \TMP2 + pxor \TMP2, \TMP3 + movdqa \TMP3, HashKey(%arg2) + + movdqa \TMP3, \TMP5 + pshufd $78, \TMP3, \TMP1 + pxor \TMP3, \TMP1 + movdqa \TMP1, HashKey_k(%arg2) + + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^2<<1 (mod poly) + movdqa \TMP5, HashKey_2(%arg2) +# HashKey_2 = HashKey^2<<1 (mod poly) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_2_k(%arg2) + + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_3(%arg2) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_3_k(%arg2) + + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_4(%arg2) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_4_k(%arg2) +.endm + +# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. 
+# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 +.macro GCM_INIT Iv SUBKEY AAD AADLEN + mov \AADLEN, %r11 + mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length + xor %r11, %r11 + mov %r11, InLen(%arg2) # ctx_data.in_length = 0 + mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 + mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 + mov \Iv, %rax + movdqu (%rax), %xmm0 + movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv + + movdqa SHUF_MASK(%rip), %xmm2 + PSHUFB_XMM %xmm2, %xmm0 + movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv + + PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + movdqa HashKey(%arg2), %xmm13 + + CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ + %xmm4, %xmm5, %xmm6 +.endm + +# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context +# struct has been initialized by GCM_INIT. +# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK +# Clobbers rax, r10-r13, and xmm0-xmm15 +.macro GCM_ENC_DEC operation + movdqu AadHash(%arg2), %xmm8 + movdqu HashKey(%arg2), %xmm13 + add %arg5, InLen(%arg2) + + xor %r11, %r11 # initialise the data pointer offset as zero + PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation + + sub %r11, %arg5 # sub partial block data used + mov %arg5, %r13 # save the number of bytes + + and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) + mov %r13, %r12 + # Encrypt/Decrypt first few blocks + + and $(3<<4), %r12 + jz _initial_num_blocks_is_0_\@ + cmp $(2<<4), %r12 + jb _initial_num_blocks_is_1_\@ + je _initial_num_blocks_is_2_\@ +_initial_num_blocks_is_3_\@: + INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation + sub $48, %r13 + jmp _initial_blocks_\@ +_initial_num_blocks_is_2_\@: + INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation + 
sub $32, %r13 + jmp _initial_blocks_\@ +_initial_num_blocks_is_1_\@: + INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation + sub $16, %r13 + jmp _initial_blocks_\@ +_initial_num_blocks_is_0_\@: + INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation +_initial_blocks_\@: + + # Main loop - Encrypt/Decrypt remaining blocks + + cmp $0, %r13 + je _zero_cipher_left_\@ + sub $64, %r13 + je _four_cipher_left_\@ +_crypt_by_4_\@: + GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ + %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ + %xmm7, %xmm8, enc + add $64, %r11 + sub $64, %r13 + jne _crypt_by_4_\@ +_four_cipher_left_\@: + GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ +%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 +_zero_cipher_left_\@: + movdqu %xmm8, AadHash(%arg2) + movdqu %xmm0, CurCount(%arg2) + + mov %arg5, %r13 + and $15, %r13 # %r13 = arg5 (mod 16) + je _multiple_of_16_bytes_\@ + + mov %r13, PBlockLen(%arg2) + + # Handle the last <16 Byte block separately + paddd ONE(%rip), %xmm0 # INCR CNT to get Yn + movdqu %xmm0, CurCount(%arg2) + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 + + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) + movdqu %xmm0, PBlockEncKey(%arg2) + + cmp $16, %arg5 + jge _large_enough_update_\@ + + lea (%arg4,%r11,1), %r10 + mov %r13, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 + jmp _data_read_\@ + +_large_enough_update_\@: + sub $16, %r11 + add %r13, %r11 + + # receive the last <16 Byte block + movdqu (%arg4, %r11, 1), %xmm1 + + sub %r13, %r11 + add $16, %r11 + + lea SHIFT_MASK+16(%rip), %r12 + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + sub %r13, %r12 + # get the appropriate shuffle mask + movdqu (%r12), %xmm2 + # shift right 
16-r13 bytes + PSHUFB_XMM %xmm2, %xmm1 + +_data_read_\@: + lea ALL_F+16(%rip), %r12 + sub %r13, %r12 + +.ifc \operation, dec + movdqa %xmm1, %xmm2 +.endif + pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) + movdqu (%r12), %xmm1 + # get the appropriate mask to mask out top 16-r13 bytes of xmm0 + pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 +.ifc \operation, dec + pand %xmm1, %xmm2 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10 ,%xmm2 + + pxor %xmm2, %xmm8 +.else + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10,%xmm0 + + pxor %xmm0, %xmm8 +.endif + + movdqu %xmm8, AadHash(%arg2) +.ifc \operation, enc + # GHASH computation for the last <16 byte block + movdqa SHUF_MASK(%rip), %xmm10 + # shuffle xmm0 back to output as ciphertext + PSHUFB_XMM %xmm10, %xmm0 +.endif + + # Output %r13 bytes + MOVQ_R64_XMM %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + mov %rax, (%arg3 , %r11, 1) + add $8, %r11 + psrldq $8, %xmm0 + MOVQ_R64_XMM %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + mov %al, (%arg3, %r11, 1) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ +_multiple_of_16_bytes_\@: +.endm + +# GCM_COMPLETE Finishes update of tag of last partial block +# Output: Authorization Tag (AUTH_TAG) +# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 +.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN + movdqu AadHash(%arg2), %xmm8 + movdqu HashKey(%arg2), %xmm13 + + mov PBlockLen(%arg2), %r12 + + cmp $0, %r12 + je _partial_done\@ + + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + +_partial_done\@: + mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + movd %r12d, %xmm15 # len(A) in %xmm15 + mov InLen(%arg2), %r12 + shl $3, %r12 # len(C) in bits (*128) + MOVQ_R64_XMM %r12, %xmm1 + + pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 + pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) + pxor %xmm15, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # 
final GHASH computation + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm8 + + movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) + pxor %xmm8, %xmm0 +_return_T_\@: + mov \AUTHTAG, %r10 # %r10 = authTag + mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len + cmp $16, %r11 + je _T_16_\@ + cmp $8, %r11 + jl _T_4_\@ +_T_8_\@: + MOVQ_R64_XMM %xmm0, %rax + mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 + psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_\@ +_T_4_\@: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_\@ +_T_123_\@: + movd %xmm0, %eax + cmp $2, %r11 + jl _T_1_\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_\@ + add $2, %r10 + sar $16, %eax +_T_1_\@: + mov %al, (%r10) + jmp _return_T_done_\@ +_T_16_\@: + movdqu %xmm0, (%r10) +_return_T_done_\@: +.endm #ifdef __x86_64__ /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -264,232 +598,188 @@ _read_next_byte_lt8_\@: _done_read_partial_block_\@: .endm -/* -* if a = number of total plaintext bytes -* b = floor(a/16) -* num_initial_blocks = b mod 4 -* encrypt the initial num_initial_blocks blocks and apply ghash on -* the ciphertext -* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers -* are clobbered -* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified -*/ - - -.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ -XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - mov arg7, %r10 # %r10 = AAD - mov arg8, %r11 # %r11 = aadLen - pxor %xmm\i, %xmm\i - pxor \XMM2, \XMM2 +# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 
+# clobbers r10-11, xmm14 +.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ + TMP6 TMP7 + MOVADQ SHUF_MASK(%rip), %xmm14 + mov \AAD, %r10 # %r10 = AAD + mov \AADLEN, %r11 # %r11 = aadLen + pxor \TMP7, \TMP7 + pxor \TMP6, \TMP6 cmp $16, %r11 - jl _get_AAD_rest\num_initial_blocks\operation -_get_AAD_blocks\num_initial_blocks\operation: - movdqu (%r10), %xmm\i - PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data - pxor %xmm\i, \XMM2 - GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + jl _get_AAD_rest\@ +_get_AAD_blocks\@: + movdqu (%r10), \TMP7 + PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pxor \TMP7, \TMP6 + GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 add $16, %r10 sub $16, %r11 cmp $16, %r11 - jge _get_AAD_blocks\num_initial_blocks\operation + jge _get_AAD_blocks\@ - movdqu \XMM2, %xmm\i + movdqu \TMP6, \TMP7 /* read the last <16B of AAD */ -_get_AAD_rest\num_initial_blocks\operation: +_get_AAD_rest\@: cmp $0, %r11 - je _get_AAD_done\num_initial_blocks\operation - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i - PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data - pxor \XMM2, %xmm\i - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + je _get_AAD_done\@ -_get_AAD_done\num_initial_blocks\operation: - xor %r11, %r11 # initialise the data pointer offset as zero - # start AES for num_initial_blocks blocks - - mov %arg5, %rax # %rax = *Y0 - movdqu (%rax), \XMM0 # XMM0 = Y0 - PSHUFB_XMM %xmm14, \XMM0 + READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 + PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pxor \TMP6, \TMP7 + GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 + movdqu \TMP7, \TMP6 -.if (\i == 5) || (\i == 6) || (\i == 7) - MOVADQ ONE(%RIP),\TMP1 - MOVADQ (%arg1),\TMP2 -.irpc index, \i_seq - paddd \TMP1, \XMM0 # INCR Y0 - movdqa \XMM0, %xmm\index - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap - pxor \TMP2, %xmm\index -.endr - lea 0x10(%arg1),%r10 - mov keysize,%eax - shr 
$2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - -aes_loop_initial_dec\num_initial_blocks: - MOVADQ (%r10),\TMP1 -.irpc index, \i_seq - AESENC \TMP1, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz aes_loop_initial_dec\num_initial_blocks - - MOVADQ (%r10), \TMP1 -.irpc index, \i_seq - AESENCLAST \TMP1, %xmm\index # Last Round -.endr -.irpc index, \i_seq - movdqu (%arg3 , %r11, 1), \TMP1 - pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg2 , %r11, 1) - # write back plaintext/ciphertext for num_initial_blocks - add $16, %r11 - - movdqa \TMP1, %xmm\index - PSHUFB_XMM %xmm14, %xmm\index - # prepare plaintext/ciphertext for GHASH computation -.endr -.endif - - # apply GHASH on num_initial_blocks blocks +_get_AAD_done\@: + movdqu \TMP6, AadHash(%arg2) +.endm -.if \i == 5 - pxor %xmm5, %xmm6 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 6 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 7 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +# between update calls. 
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK +# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context +# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 +.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ + AAD_HASH operation + mov PBlockLen(%arg2), %r13 + cmp $0, %r13 + je _partial_block_done_\@ # Leave Macro if no partial blocks + # Read in input data without over reading + cmp $16, \PLAIN_CYPH_LEN + jl _fewer_than_16_bytes_\@ + movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm + jmp _data_read_\@ + +_fewer_than_16_bytes_\@: + lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 + mov \PLAIN_CYPH_LEN, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 + + mov PBlockLen(%arg2), %r13 + +_data_read_\@: # Finished reading in data + + movdqu PBlockEncKey(%arg2), %xmm9 + movdqu HashKey(%arg2), %xmm13 + + lea SHIFT_MASK(%rip), %r12 + + # adjust the shuffle mask pointer to be able to shift r13 bytes + # r16-r13 is the number of bytes in plaintext mod 16) + add %r13, %r12 + movdqu (%r12), %xmm2 # get the appropriate shuffle mask + PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes + +.ifc \operation, dec + movdqa %xmm1, %xmm3 + pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_1_\@ + sub %r10, %r12 +_no_extra_mask_1_\@: + + movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 + + pand %xmm1, %xmm3 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm3 + PSHUFB_XMM %xmm2, %xmm3 + pxor %xmm3, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_1_\@ + + # GHASH computation for the last <16 Byte block + GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, 
%xmm11, %xmm5, %xmm6 + xor %rax,%rax + + mov %rax, PBlockLen(%arg2) + jmp _dec_done_\@ +_partial_incomplete_1_\@: + add \PLAIN_CYPH_LEN, PBlockLen(%arg2) +_dec_done_\@: + movdqu \AAD_HASH, AadHash(%arg2) +.else + pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_2_\@ + sub %r10, %r12 +_no_extra_mask_2_\@: + + movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand %xmm1, %xmm9 + + movdqa SHUF_MASK(%rip), %xmm1 + PSHUFB_XMM %xmm1, %xmm9 + PSHUFB_XMM %xmm2, %xmm9 + pxor %xmm9, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_2_\@ + + # GHASH computation for the last <16 Byte block + GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %rax,%rax + + mov %rax, PBlockLen(%arg2) + jmp _encode_done_\@ +_partial_incomplete_2_\@: + add \PLAIN_CYPH_LEN, PBlockLen(%arg2) +_encode_done_\@: + movdqu \AAD_HASH, AadHash(%arg2) + + movdqa SHUF_MASK(%rip), %xmm10 + # shuffle xmm9 back to output as ciphertext + PSHUFB_XMM %xmm10, %xmm9 + PSHUFB_XMM %xmm2, %xmm9 .endif - cmp $64, %r13 - jl _initial_blocks_done\num_initial_blocks\operation - # no need for precomputed values -/* -* -* Precomputations for HashKey parallel with encryption of first 4 blocks. 
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i -*/ - MOVADQ ONE(%rip), \TMP1 - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM1 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM2 - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM3 - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM4 - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap - - MOVADQ 0(%arg1),\TMP1 - pxor \TMP1, \XMM1 - pxor \TMP1, \XMM2 - pxor \TMP1, \XMM3 - pxor \TMP1, \XMM4 - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqa \TMP1, HashKey_k(%rsp) - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqa \TMP5, HashKey_2(%rsp) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_2_k(%rsp) -.irpc index, 1234 # do 4 rounds - movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 -.endr - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqa \TMP5, HashKey_3(%rsp) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_3_k(%rsp) -.irpc index, 56789 # do next 5 rounds - movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 -.endr - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqa \TMP5, HashKey_4(%rsp) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_4_k(%rsp) - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_pre_dec_done\num_initial_blocks - -aes_loop_pre_dec\num_initial_blocks: - MOVADQ (%r10),\TMP2 -.irpc index, 1234 - AESENC \TMP2, 
%xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz aes_loop_pre_dec\num_initial_blocks - -aes_loop_pre_dec_done\num_initial_blocks: - MOVADQ (%r10), \TMP2 - AESENCLAST \TMP2, \XMM1 - AESENCLAST \TMP2, \XMM2 - AESENCLAST \TMP2, \XMM3 - AESENCLAST \TMP2, \XMM4 - movdqu 16*0(%arg3 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM1 - movdqu \XMM1, 16*0(%arg2 , %r11 , 1) - movdqa \TMP1, \XMM1 - movdqu 16*1(%arg3 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM2 - movdqu \XMM2, 16*1(%arg2 , %r11 , 1) - movdqa \TMP1, \XMM2 - movdqu 16*2(%arg3 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM3 - movdqu \XMM3, 16*2(%arg2 , %r11 , 1) - movdqa \TMP1, \XMM3 - movdqu 16*3(%arg3 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM4 - movdqu \XMM4, 16*3(%arg2 , %r11 , 1) - movdqa \TMP1, \XMM4 - add $64, %r11 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap - pxor \XMMDst, \XMM1 -# combine GHASHed value with the corresponding ciphertext - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap - -_initial_blocks_done\num_initial_blocks\operation: - -.endm + # output encrypted Bytes + cmp $0, %r10 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 + # Set r13 to be the number of bytes to write out + sub %r12, %r13 + jmp _count_set_\@ +_partial_fill_\@: + mov \PLAIN_CYPH_LEN, %r13 +_count_set_\@: + movdqa %xmm9, %xmm0 + MOVQ_R64_XMM %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $8, \DATA_OFFSET + psrldq $8, %xmm0 + MOVQ_R64_XMM %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $1, \DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ +_partial_block_done_\@: +.endm # PARTIAL_BLOCK /* * if a = number of total plaintext bytes @@ -499,49 +789,19 @@ _initial_blocks_done\num_initial_blocks\operation: * the ciphertext * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 
* are clobbered -* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified +* arg1, %arg2, %arg3 are used as a pointer only, not modified */ -.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ -XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - mov arg7, %r10 # %r10 = AAD - mov arg8, %r11 # %r11 = aadLen - pxor %xmm\i, %xmm\i - pxor \XMM2, \XMM2 +.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ + XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 - cmp $16, %r11 - jl _get_AAD_rest\num_initial_blocks\operation -_get_AAD_blocks\num_initial_blocks\operation: - movdqu (%r10), %xmm\i - PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data - pxor %xmm\i, \XMM2 - GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - add $16, %r10 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\num_initial_blocks\operation + movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 - movdqu \XMM2, %xmm\i - - /* read the last <16B of AAD */ -_get_AAD_rest\num_initial_blocks\operation: - cmp $0, %r11 - je _get_AAD_done\num_initial_blocks\operation - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i - PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data - pxor \XMM2, %xmm\i - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - -_get_AAD_done\num_initial_blocks\operation: - xor %r11, %r11 # initialise the data pointer offset as zero # start AES for num_initial_blocks blocks - mov %arg5, %rax # %rax = *Y0 - movdqu (%rax), \XMM0 # XMM0 = Y0 - PSHUFB_XMM %xmm14, \XMM0 + movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 .if (\i == 5) || (\i == 6) || (\i == 7) @@ -549,7 +809,11 @@ _get_AAD_done\num_initial_blocks\operation: MOVADQ 0(%arg1),\TMP2 .irpc index, \i_seq paddd \TMP1, \XMM0 # INCR Y0 +.ifc \operation, dec + movdqa \XMM0, %xmm\index +.else MOVADQ \XMM0, %xmm\index +.endif PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap pxor \TMP2, %xmm\index .endr @@ -558,25 
+822,29 @@ _get_AAD_done\num_initial_blocks\operation: shr $2,%eax # 128->4, 192->6, 256->8 add $5,%eax # 128->9, 192->11, 256->13 -aes_loop_initial_enc\num_initial_blocks: +aes_loop_initial_\@: MOVADQ (%r10),\TMP1 .irpc index, \i_seq AESENC \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_initial_enc\num_initial_blocks + jnz aes_loop_initial_\@ MOVADQ (%r10), \TMP1 .irpc index, \i_seq AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq - movdqu (%arg3 , %r11, 1), \TMP1 + movdqu (%arg4 , %r11, 1), \TMP1 pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg2 , %r11, 1) + movdqu %xmm\index, (%arg3 , %r11, 1) # write back plaintext/ciphertext for num_initial_blocks add $16, %r11 + +.ifc \operation, dec + movdqa \TMP1, %xmm\index +.endif PSHUFB_XMM %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation @@ -602,7 +870,7 @@ aes_loop_initial_enc\num_initial_blocks: GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .endif cmp $64, %r13 - jl _initial_blocks_done\num_initial_blocks\operation + jl _initial_blocks_done\@ # no need for precomputed values /* * @@ -631,17 +899,6 @@ aes_loop_initial_enc\num_initial_blocks: pxor \TMP1, \XMM2 pxor \TMP1, \XMM3 pxor \TMP1, \XMM4 - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqa \TMP1, HashKey_k(%rsp) - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqa \TMP5, HashKey_2(%rsp) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_2_k(%rsp) .irpc index, 1234 # do 4 rounds movaps 0x10*\index(%arg1), \TMP1 AESENC \TMP1, \XMM1 @@ -649,12 +906,6 @@ aes_loop_initial_enc\num_initial_blocks: AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 .endr - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqa \TMP5, HashKey_3(%rsp) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_3_k(%rsp) .irpc index, 56789 # 
do next 5 rounds movaps 0x10*\index(%arg1), \TMP1 AESENC \TMP1, \XMM1 @@ -662,45 +913,56 @@ aes_loop_initial_enc\num_initial_blocks: AESENC \TMP1, \XMM3 AESENC \TMP1, \XMM4 .endr - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqa \TMP5, HashKey_4(%rsp) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqa \TMP1, HashKey_4_k(%rsp) lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_pre_enc_done\num_initial_blocks + jz aes_loop_pre_done\@ -aes_loop_pre_enc\num_initial_blocks: +aes_loop_pre_\@: MOVADQ (%r10),\TMP2 .irpc index, 1234 AESENC \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_pre_enc\num_initial_blocks + jnz aes_loop_pre_\@ -aes_loop_pre_enc_done\num_initial_blocks: +aes_loop_pre_done\@: MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 AESENCLAST \TMP2, \XMM4 - movdqu 16*0(%arg3 , %r11 , 1), \TMP1 + movdqu 16*0(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 - movdqu 16*1(%arg3 , %r11 , 1), \TMP1 +.ifc \operation, dec + movdqu \XMM1, 16*0(%arg3 , %r11 , 1) + movdqa \TMP1, \XMM1 +.endif + movdqu 16*1(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM2 - movdqu 16*2(%arg3 , %r11 , 1), \TMP1 +.ifc \operation, dec + movdqu \XMM2, 16*1(%arg3 , %r11 , 1) + movdqa \TMP1, \XMM2 +.endif + movdqu 16*2(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM3 - movdqu 16*3(%arg3 , %r11 , 1), \TMP1 +.ifc \operation, dec + movdqu \XMM3, 16*2(%arg3 , %r11 , 1) + movdqa \TMP1, \XMM3 +.endif + movdqu 16*3(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM4 - movdqu \XMM1, 16*0(%arg2 , %r11 , 1) - movdqu \XMM2, 16*1(%arg2 , %r11 , 1) - movdqu \XMM3, 16*2(%arg2 , %r11 , 1) - movdqu \XMM4, 16*3(%arg2 , %r11 , 1) +.ifc \operation, dec + movdqu \XMM4, 16*3(%arg3 , %r11 , 1) + movdqa \TMP1, \XMM4 +.else + movdqu \XMM1, 16*0(%arg3 , %r11 , 1) + movdqu \XMM2, 16*1(%arg3 , %r11 , 1) + movdqu \XMM3, 16*2(%arg3 , %r11 , 1) + movdqu 
\XMM4, 16*3(%arg3 , %r11 , 1) +.endif add $64, %r11 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap @@ -710,14 +972,14 @@ aes_loop_pre_enc_done\num_initial_blocks: PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap -_initial_blocks_done\num_initial_blocks\operation: +_initial_blocks_done\@: .endm /* * encrypt 4 blocks at a time * ghash the 4 previously encrypted ciphertext blocks -* arg1, %arg2, %arg3 are used as pointers only, not modified +* arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ @@ -735,7 +997,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM5, \TMP6 pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT - movdqa HashKey_4(%rsp), \TMP5 + movdqa HashKey_4(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT @@ -754,7 +1016,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 - movdqa HashKey_4_k(%rsp), \TMP5 + movdqa HashKey_4_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 1 @@ -769,7 +1031,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 - movdqa HashKey_3(%rsp), \TMP5 + movdqa HashKey_3(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 3 @@ -782,7 +1044,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_3_k(%rsp), \TMP5 + movdqa HashKey_3_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 5 @@ -796,7 +1058,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 
operation movdqa \XMM7, \TMP1 pshufd $78, \XMM7, \TMP2 pxor \XMM7, \TMP2 - movdqa HashKey_2(%rsp ), \TMP5 + movdqa HashKey_2(%arg2), \TMP5 # Multiply TMP5 * HashKey using karatsuba @@ -812,7 +1074,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_2_k(%rsp), \TMP5 + movdqa HashKey_2_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 8 @@ -830,7 +1092,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM8, \TMP1 pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 - movdqa HashKey(%rsp), \TMP5 + movdqa HashKey(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 9 @@ -842,37 +1104,37 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_par_enc_done + jz aes_loop_par_enc_done\@ -aes_loop_par_enc: +aes_loop_par_enc\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 AESENC \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_par_enc + jnz aes_loop_par_enc\@ -aes_loop_par_enc_done: +aes_loop_par_enc_done\@: MOVADQ (%r10), \TMP3 AESENCLAST \TMP3, \XMM1 # Round 10 AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 - movdqa HashKey_k(%rsp), \TMP5 + movdqa HashKey_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg3,%r11,1), \TMP3 + movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu 16(%arg3,%r11,1), \TMP3 + movdqu 16(%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu 32(%arg3,%r11,1), \TMP3 + movdqu 32(%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu 48(%arg3,%r11,1), \TMP3 + movdqu 48(%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg2,%r11,1) # 
Write to the ciphertext buffer - movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer - movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer + movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer + movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer + movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap @@ -925,7 +1187,7 @@ aes_loop_par_enc_done: /* * decrypt 4 blocks at a time * ghash the 4 previously decrypted ciphertext blocks -* arg1, %arg2, %arg3 are used as pointers only, not modified +* arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ @@ -943,7 +1205,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM5, \TMP6 pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT - movdqa HashKey_4(%rsp), \TMP5 + movdqa HashKey_4(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT @@ -962,7 +1224,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 - movdqa HashKey_4_k(%rsp), \TMP5 + movdqa HashKey_4_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 AESENC \TMP1, \XMM1 # Round 1 @@ -977,7 +1239,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 - movdqa HashKey_3(%rsp), \TMP5 + movdqa HashKey_3(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 3 @@ -990,7 +1252,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 
XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_3_k(%rsp), \TMP5 + movdqa HashKey_3_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 5 @@ -1004,7 +1266,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM7, \TMP1 pshufd $78, \XMM7, \TMP2 pxor \XMM7, \TMP2 - movdqa HashKey_2(%rsp ), \TMP5 + movdqa HashKey_2(%arg2), \TMP5 # Multiply TMP5 * HashKey using karatsuba @@ -1020,7 +1282,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM2 AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 - movdqa HashKey_2_k(%rsp), \TMP5 + movdqa HashKey_2_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 8 @@ -1038,7 +1300,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM8, \TMP1 pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 - movdqa HashKey(%rsp), \TMP5 + movdqa HashKey(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 AESENC \TMP3, \XMM1 # Round 9 @@ -1050,40 +1312,40 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_par_dec_done + jz aes_loop_par_dec_done\@ -aes_loop_par_dec: +aes_loop_par_dec\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 AESENC \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_par_dec + jnz aes_loop_par_dec\@ -aes_loop_par_dec_done: +aes_loop_par_dec_done\@: MOVADQ (%r10), \TMP3 AESENCLAST \TMP3, \XMM1 # last round AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 - movdqa HashKey_k(%rsp), \TMP5 + movdqa HashKey_k(%arg2), \TMP5 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg3,%r11,1), \TMP3 + movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, 
(%arg2,%r11,1) # Write to plaintext buffer + movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM1 - movdqu 16(%arg3,%r11,1), \TMP3 + movdqu 16(%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer + movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM2 - movdqu 32(%arg3,%r11,1), \TMP3 + movdqu 32(%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer + movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM3 - movdqu 48(%arg3,%r11,1), \TMP3 + movdqu 48(%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer + movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM4 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap @@ -1143,10 +1405,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM1, \TMP6 pshufd $78, \XMM1, \TMP2 pxor \XMM1, \TMP2 - movdqa HashKey_4(%rsp), \TMP5 + movdqa HashKey_4(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqa HashKey_4_k(%rsp), \TMP4 + movdqa HashKey_4_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqa \XMM1, \XMMDst movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 @@ -1156,10 +1418,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM2, \TMP1 pshufd $78, \XMM2, \TMP2 pxor \XMM2, \TMP2 - movdqa HashKey_3(%rsp), \TMP5 + movdqa HashKey_3(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqa HashKey_3_k(%rsp), \TMP4 + movdqa HashKey_3_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM2, \XMMDst @@ -1171,10 +1433,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM3, \TMP1 pshufd $78, \XMM3, \TMP2 pxor 
\XMM3, \TMP2 - movdqa HashKey_2(%rsp), \TMP5 + movdqa HashKey_2(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqa HashKey_2_k(%rsp), \TMP4 + movdqa HashKey_2_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM3, \XMMDst @@ -1184,10 +1446,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst movdqa \XMM4, \TMP1 pshufd $78, \XMM4, \TMP2 pxor \XMM4, \TMP2 - movdqa HashKey(%rsp), \TMP5 + movdqa HashKey(%arg2), \TMP5 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqa HashKey_k(%rsp), \TMP4 + movdqa HashKey_k(%arg2), \TMP4 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM4, \XMMDst @@ -1256,6 +1518,8 @@ _esb_loop_\@: .endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* struct gcm_context_data *data +* // Context data * u8 *out, // Plaintext output. Encrypt in-place is allowed. * const u8 *in, // Ciphertext input * u64 plaintext_len, // Length of data in bytes for decryption. 
@@ -1333,195 +1597,20 @@ _esb_loop_\@: * *****************************************************************************/ ENTRY(aesni_gcm_dec) - push %r12 - push %r13 - push %r14 - mov %rsp, %r14 -/* -* states of %xmm registers %xmm6:%xmm15 not saved -* all %xmm registers are clobbered -*/ - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes - mov %arg6, %r12 - movdqu (%r12), %xmm13 # %xmm13 = HashKey - movdqa SHUF_MASK(%rip), %xmm2 - PSHUFB_XMM %xmm2, %xmm13 - - -# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) - - movdqa %xmm13, %xmm2 - psllq $1, %xmm13 - psrlq $63, %xmm2 - movdqa %xmm2, %xmm1 - pslldq $8, %xmm2 - psrldq $8, %xmm1 - por %xmm2, %xmm13 - - # Reduction - - pshufd $0x24, %xmm1, %xmm2 - pcmpeqd TWOONE(%rip), %xmm2 - pand POLY(%rip), %xmm2 - pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) - - - # Decrypt first few blocks - - movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) - mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) - mov %r13, %r12 - and $(3<<4), %r12 - jz _initial_num_blocks_is_0_decrypt - cmp $(2<<4), %r12 - jb _initial_num_blocks_is_1_decrypt - je _initial_num_blocks_is_2_decrypt -_initial_num_blocks_is_3_decrypt: - INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec - sub $48, %r13 - jmp _initial_blocks_decrypted -_initial_num_blocks_is_2_decrypt: - INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec - sub $32, %r13 - jmp _initial_blocks_decrypted -_initial_num_blocks_is_1_decrypt: - INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec - sub $16, %r13 - jmp _initial_blocks_decrypted -_initial_num_blocks_is_0_decrypt: - INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, 
%xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec -_initial_blocks_decrypted: - cmp $0, %r13 - je _zero_cipher_left_decrypt - sub $64, %r13 - je _four_cipher_left_decrypt -_decrypt_by_4: - GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ -%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec - add $64, %r11 - sub $64, %r13 - jne _decrypt_by_4 -_four_cipher_left_decrypt: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -_zero_cipher_left_decrypt: - mov %arg4, %r13 - and $15, %r13 # %r13 = arg4 (mod 16) - je _multiple_of_16_bytes_decrypt - - # Handle the last <16 byte block separately - - paddd ONE(%rip), %xmm0 # increment CNT to get Yn - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) - - lea (%arg3,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - movdqa %xmm1, %xmm2 - pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 - pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 - pand %xmm1, %xmm2 - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10 ,%xmm2 - - pxor %xmm2, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - - # output %r13 bytes - MOVQ_R64_XMM %xmm0, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left_decrypt - mov %rax, (%arg2 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax - sub $8, %r13 -_less_than_8_bytes_left_decrypt: - mov %al, (%arg2, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left_decrypt -_multiple_of_16_bytes_decrypt: - mov arg8, %r12 # %r13 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - movd %r12d, %xmm15 # len(A) in %xmm15 - shl $3, %arg4 # len(C) in bits (*128) - MOVQ_R64_XMM %arg4, %xmm1 - pslldq $8, 
%xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm8 + FUNC_SAVE - mov %arg5, %rax # %rax = *Y0 - movdqu (%rax), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) - pxor %xmm8, %xmm0 -_return_T_decrypt: - mov arg9, %r10 # %r10 = authTag - mov arg10, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je _T_16_decrypt - cmp $8, %r11 - jl _T_4_decrypt -_T_8_decrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - cmp $0, %r11 - je _return_T_done_decrypt -_T_4_decrypt: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - cmp $0, %r11 - je _return_T_done_decrypt -_T_123_decrypt: - movd %xmm0, %eax - cmp $2, %r11 - jl _T_1_decrypt - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done_decrypt - add $2, %r10 - sar $16, %eax -_T_1_decrypt: - mov %al, (%r10) - jmp _return_T_done_decrypt -_T_16_decrypt: - movdqu %xmm0, (%r10) -_return_T_done_decrypt: - mov %r14, %rsp - pop %r14 - pop %r13 - pop %r12 + GCM_INIT %arg6, arg7, arg8, arg9 + GCM_ENC_DEC dec + GCM_COMPLETE arg10, arg11 + FUNC_RESTORE ret ENDPROC(aesni_gcm_dec) /***************************************************************************** * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* struct gcm_context_data *data +* // Context data * u8 *out, // Ciphertext output. Encrypt in-place is allowed. * const u8 *in, // Plaintext input * u64 plaintext_len, // Length of data in bytes for encryption. 
@@ -1596,195 +1685,78 @@ ENDPROC(aesni_gcm_dec) * poly = x^128 + x^127 + x^126 + x^121 + 1 ***************************************************************************/ ENTRY(aesni_gcm_enc) - push %r12 - push %r13 - push %r14 - mov %rsp, %r14 -# -# states of %xmm registers %xmm6:%xmm15 not saved -# all %xmm registers are clobbered -# - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp - mov %arg6, %r12 - movdqu (%r12), %xmm13 - movdqa SHUF_MASK(%rip), %xmm2 - PSHUFB_XMM %xmm2, %xmm13 - - -# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - - movdqa %xmm13, %xmm2 - psllq $1, %xmm13 - psrlq $63, %xmm2 - movdqa %xmm2, %xmm1 - pslldq $8, %xmm2 - psrldq $8, %xmm1 - por %xmm2, %xmm13 - - # reduce HashKey<<1 - - pshufd $0x24, %xmm1, %xmm2 - pcmpeqd TWOONE(%rip), %xmm2 - pand POLY(%rip), %xmm2 - pxor %xmm2, %xmm13 - movdqa %xmm13, HashKey(%rsp) - mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) - and $-16, %r13 - mov %r13, %r12 + FUNC_SAVE - # Encrypt first few blocks + GCM_INIT %arg6, arg7, arg8, arg9 + GCM_ENC_DEC enc - and $(3<<4), %r12 - jz _initial_num_blocks_is_0_encrypt - cmp $(2<<4), %r12 - jb _initial_num_blocks_is_1_encrypt - je _initial_num_blocks_is_2_encrypt -_initial_num_blocks_is_3_encrypt: - INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc - sub $48, %r13 - jmp _initial_blocks_encrypted -_initial_num_blocks_is_2_encrypt: - INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc - sub $32, %r13 - jmp _initial_blocks_encrypted -_initial_num_blocks_is_1_encrypt: - INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc - sub $16, %r13 - jmp _initial_blocks_encrypted -_initial_num_blocks_is_0_encrypt: - INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, 
%xmm6, 8, 0, enc -_initial_blocks_encrypted: - - # Main loop - Encrypt remaining blocks - - cmp $0, %r13 - je _zero_cipher_left_encrypt - sub $64, %r13 - je _four_cipher_left_encrypt -_encrypt_by_4_encrypt: - GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ -%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc - add $64, %r11 - sub $64, %r13 - jne _encrypt_by_4_encrypt -_four_cipher_left_encrypt: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -_zero_cipher_left_encrypt: - mov %arg4, %r13 - and $15, %r13 # %r13 = arg4 (mod 16) - je _multiple_of_16_bytes_encrypt - - # Handle the last <16 Byte block separately - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - - lea (%arg3,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10,%xmm0 + GCM_COMPLETE arg10, arg11 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_enc) - pxor %xmm0, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # GHASH computation for the last <16 byte block - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm0 +/***************************************************************************** +* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* struct gcm_context_data *data, +* // context data +* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) +* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) +* // concatenated with 0x00000001. 16-byte aligned pointer. 
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. +* const u8 *aad, // Additional Authentication Data (AAD) +* u64 aad_len) // Length of AAD in bytes. +*/ +ENTRY(aesni_gcm_init) + FUNC_SAVE + GCM_INIT %arg3, %arg4,%arg5, %arg6 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_init) - # shuffle xmm0 back to output as ciphertext +/***************************************************************************** +* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* struct gcm_context_data *data, +* // context data +* u8 *out, // Ciphertext output. Encrypt in-place is allowed. +* const u8 *in, // Plaintext input +* u64 plaintext_len, // Length of data in bytes for encryption. +*/ +ENTRY(aesni_gcm_enc_update) + FUNC_SAVE + GCM_ENC_DEC enc + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_enc_update) - # Output %r13 bytes - MOVQ_R64_XMM %xmm0, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left_encrypt - mov %rax, (%arg2 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax - sub $8, %r13 -_less_than_8_bytes_left_encrypt: - mov %al, (%arg2, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left_encrypt -_multiple_of_16_bytes_encrypt: - mov arg8, %r12 # %r12 = addLen (number of bytes) - shl $3, %r12 - movd %r12d, %xmm15 # len(A) in %xmm15 - shl $3, %arg4 # len(C) in bits (*128) - MOVQ_R64_XMM %arg4, %xmm1 - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap +/***************************************************************************** +* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* struct gcm_context_data *data, +* // context data +* u8 *out, // Ciphertext output. 
Encrypt in-place is allowed. +* const u8 *in, // Plaintext input +* u64 plaintext_len, // Length of data in bytes for encryption. +*/ +ENTRY(aesni_gcm_dec_update) + FUNC_SAVE + GCM_ENC_DEC dec + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_dec_update) - mov %arg5, %rax # %rax = *Y0 - movdqu (%rax), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) - pxor %xmm8, %xmm0 -_return_T_encrypt: - mov arg9, %r10 # %r10 = authTag - mov arg10, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je _T_16_encrypt - cmp $8, %r11 - jl _T_4_encrypt -_T_8_encrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - cmp $0, %r11 - je _return_T_done_encrypt -_T_4_encrypt: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - cmp $0, %r11 - je _return_T_done_encrypt -_T_123_encrypt: - movd %xmm0, %eax - cmp $2, %r11 - jl _T_1_encrypt - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done_encrypt - add $2, %r10 - sar $16, %eax -_T_1_encrypt: - mov %al, (%r10) - jmp _return_T_done_encrypt -_T_16_encrypt: - movdqu %xmm0, (%r10) -_return_T_done_encrypt: - mov %r14, %rsp - pop %r14 - pop %r13 - pop %r12 +/***************************************************************************** +* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* struct gcm_context_data *data, +* // context data +* u8 *auth_tag, // Authenticated Tag output. +* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), +* // 12 or 8. 
+*/ +ENTRY(aesni_gcm_finalize) + FUNC_SAVE + GCM_COMPLETE %arg3 %arg4 + FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc) +ENDPROC(aesni_gcm_finalize) #endif diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 34cf1c1f8c98..acbe7e8336d8 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -72,6 +72,21 @@ struct aesni_xts_ctx { u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; }; +#define GCM_BLOCK_LEN 16 + +struct gcm_context_data { + /* init, update and finalize context data */ + u8 aad_hash[GCM_BLOCK_LEN]; + u64 aad_length; + u64 in_length; + u8 partial_block_enc_key[GCM_BLOCK_LEN]; + u8 orig_IV[GCM_BLOCK_LEN]; + u8 current_counter[GCM_BLOCK_LEN]; + u64 partial_block_len; + u64 unused; + u8 hash_keys[GCM_BLOCK_LEN * 8]; +}; + asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out, @@ -105,6 +120,7 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * struct gcm_context_data. May be uninitialized. * u8 *out, Ciphertext output. Encrypt in-place is allowed. * const u8 *in, Plaintext input * unsigned long plaintext_len, Length of data in bytes for encryption. @@ -117,13 +133,15 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, * unsigned long auth_tag_len), Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. */ -asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); /* asmlinkage void aesni_gcm_dec() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * struct gcm_context_data. 
May be uninitialized. * u8 *out, Plaintext output. Decrypt in-place is allowed. * const u8 *in, Ciphertext input * unsigned long ciphertext_len, Length of data in bytes for decryption. @@ -137,11 +155,28 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, * unsigned long auth_tag_len) Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. */ -asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); +/* Scatter / Gather routines, with args similar to above */ +asmlinkage void aesni_gcm_init(void *ctx, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, const u8 *aad, + unsigned long aad_len); +asmlinkage void aesni_gcm_enc_update(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); #ifdef CONFIG_AS_AVX asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, @@ -167,15 +202,17 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx(void *ctx, u8 *out, +static void aesni_gcm_enc_avx(void *ctx, + struct gcm_context_data *data, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){ - aesni_gcm_enc(ctx, out, in, plaintext_len, iv, 
hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_enc(ctx, data, out, in, + plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); } else { aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, @@ -183,15 +220,17 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out, } } -static void aesni_gcm_dec_avx(void *ctx, u8 *out, +static void aesni_gcm_dec_avx(void *ctx, + struct gcm_context_data *data, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { - aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_dec(ctx, data, out, in, + ciphertext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); } else { aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, @@ -218,15 +257,17 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx2(void *ctx, u8 *out, +static void aesni_gcm_enc_avx2(void *ctx, + struct gcm_context_data *data, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { - aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_enc(ctx, data, out, in, + plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { 
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, @@ -238,15 +279,17 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out, } } -static void aesni_gcm_dec_avx2(void *ctx, u8 *out, +static void aesni_gcm_dec_avx2(void *ctx, + struct gcm_context_data *data, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { - aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, - aad, aad_len, auth_tag, auth_tag_len); + aesni_gcm_dec(ctx, data, out, in, + ciphertext_len, iv, hash_subkey, + aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, @@ -259,15 +302,19 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out, } #endif -static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); +static void (*aesni_gcm_enc_tfm)(void *ctx, + struct gcm_context_data *data, u8 *out, + const u8 *in, unsigned long plaintext_len, + u8 *iv, u8 *hash_subkey, const u8 *aad, + unsigned long aad_len, u8 *auth_tag, + unsigned long auth_tag_len); -static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len); +static void (*aesni_gcm_dec_tfm)(void *ctx, + struct gcm_context_data *data, u8 *out, + const u8 *in, unsigned long ciphertext_len, + u8 *iv, u8 *hash_subkey, const u8 *aad, + unsigned long aad_len, u8 *auth_tag, + unsigned long auth_tag_len); static inline 
struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) @@ -744,6 +791,127 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, return 0; } +static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, + unsigned int assoclen, u8 *hash_subkey, + u8 *iv, void *aes_ctx) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + struct gcm_context_data data AESNI_ALIGN_ATTR; + struct scatter_walk dst_sg_walk = {}; + unsigned long left = req->cryptlen; + unsigned long len, srclen, dstlen; + struct scatter_walk assoc_sg_walk; + struct scatter_walk src_sg_walk; + struct scatterlist src_start[2]; + struct scatterlist dst_start[2]; + struct scatterlist *src_sg; + struct scatterlist *dst_sg; + u8 *src, *dst, *assoc; + u8 *assocmem = NULL; + u8 authTag[16]; + + if (!enc) + left -= auth_tag_len; + + /* Linearize assoc, if not already linear */ + if (req->src->length >= assoclen && req->src->length && + (!PageHighMem(sg_page(req->src)) || + req->src->offset + req->src->length < PAGE_SIZE)) { + scatterwalk_start(&assoc_sg_walk, req->src); + assoc = scatterwalk_map(&assoc_sg_walk); + } else { + /* assoc can be any length, so must be on heap */ + assocmem = kmalloc(assoclen, GFP_ATOMIC); + if (unlikely(!assocmem)) + return -ENOMEM; + assoc = assocmem; + + scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); + } + + src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen); + scatterwalk_start(&src_sg_walk, src_sg); + if (req->src != req->dst) { + dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen); + scatterwalk_start(&dst_sg_walk, dst_sg); + } + + kernel_fpu_begin(); + aesni_gcm_init(aes_ctx, &data, iv, + hash_subkey, assoc, assoclen); + if (req->src != req->dst) { + while (left) { + src = scatterwalk_map(&src_sg_walk); + dst = scatterwalk_map(&dst_sg_walk); + srclen = scatterwalk_clamp(&src_sg_walk, left); + dstlen = scatterwalk_clamp(&dst_sg_walk, left); + 
len = min(srclen, dstlen); + if (len) { + if (enc) + aesni_gcm_enc_update(aes_ctx, &data, + dst, src, len); + else + aesni_gcm_dec_update(aes_ctx, &data, + dst, src, len); + } + left -= len; + + scatterwalk_unmap(src); + scatterwalk_unmap(dst); + scatterwalk_advance(&src_sg_walk, len); + scatterwalk_advance(&dst_sg_walk, len); + scatterwalk_done(&src_sg_walk, 0, left); + scatterwalk_done(&dst_sg_walk, 1, left); + } + } else { + while (left) { + dst = src = scatterwalk_map(&src_sg_walk); + len = scatterwalk_clamp(&src_sg_walk, left); + if (len) { + if (enc) + aesni_gcm_enc_update(aes_ctx, &data, + src, src, len); + else + aesni_gcm_dec_update(aes_ctx, &data, + src, src, len); + } + left -= len; + scatterwalk_unmap(src); + scatterwalk_advance(&src_sg_walk, len); + scatterwalk_done(&src_sg_walk, 1, left); + } + } + aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len); + kernel_fpu_end(); + + if (!assocmem) + scatterwalk_unmap(assoc); + else + kfree(assocmem); + + if (!enc) { + u8 authTagMsg[16]; + + /* Copy out original authTag */ + scatterwalk_map_and_copy(authTagMsg, req->src, + req->assoclen + req->cryptlen - + auth_tag_len, + auth_tag_len, 0); + + /* Compare generated tag with passed in tag. */ + return crypto_memneq(authTagMsg, authTag, auth_tag_len) ? 
+ -EBADMSG : 0; + } + + /* Copy in the authTag */ + scatterwalk_map_and_copy(authTag, req->dst, + req->assoclen + req->cryptlen, + auth_tag_len, 1); + + return 0; +} + static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { @@ -753,7 +921,14 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, unsigned long auth_tag_len = crypto_aead_authsize(tfm); struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; + struct gcm_context_data data AESNI_ALIGN_ATTR; + if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || + aesni_gcm_enc_tfm == aesni_gcm_enc || + req->cryptlen < AVX_GEN2_OPTSIZE) { + return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, + aes_ctx); + } if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || req->src->offset + req->src->length <= PAGE_SIZE) && @@ -782,7 +957,7 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, } kernel_fpu_begin(); - aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, + aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv, hash_subkey, assoc, assoclen, dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); @@ -817,8 +992,15 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 authTag[16]; struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; + struct gcm_context_data data AESNI_ALIGN_ATTR; int retval = 0; + if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || + aesni_gcm_enc_tfm == aesni_gcm_enc || + req->cryptlen < AVX_GEN2_OPTSIZE) { + return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, + aes_ctx); + } tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); if (sg_is_last(req->src) && @@ -849,7 +1031,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, kernel_fpu_begin(); - aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, + 
aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv, hash_subkey, assoc, assoclen, authTag, auth_tag_len); kernel_fpu_end(); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index f9eca34301e2..3e0c07cc9124 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -25,13 +25,13 @@ * */ -#include <asm/processor.h> +#include <crypto/algapi.h> #include <crypto/blowfish.h> +#include <crypto/internal/skcipher.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <crypto/algapi.h> /* regular block cipher functions */ asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, @@ -77,20 +77,28 @@ static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, +static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return blowfish_setkey(&tfm->base, key, keylen); +} + +static int ecb_crypt(struct skcipher_request *req, void (*fn)(struct bf_ctx *, u8 *, const u8 *), void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *)) { - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = BF_BLOCK_SIZE; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; unsigned int nbytes; int err; - err = blkcipher_walk_virt(desc, walk); + err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; + while ((nbytes = walk.nbytes)) { + u8 *wsrc = walk.src.virt.addr; + u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ if (nbytes >= bsize * 4) { @@ -116,34 +124,25 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, } while (nbytes >= 
bsize); done: - err = blkcipher_walk_done(desc, walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } return err; } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, blowfish_enc_blk, blowfish_enc_blk_4way); + return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way); + return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way); } -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __cbc_encrypt(struct bf_ctx *ctx, + struct skcipher_walk *walk) { - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = BF_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -164,27 +163,27 @@ static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, return nbytes; } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = 
blkcipher_walk_done(desc, &walk, nbytes); + nbytes = __cbc_encrypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } return err; } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __cbc_decrypt(struct bf_ctx *ctx, + struct skcipher_walk *walk) { - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = BF_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -245,24 +244,25 @@ done: return nbytes; } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = __cbc_decrypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } return err; } -static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) +static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk) { u8 *ctrblk = walk->iv; u8 keystream[BF_BLOCK_SIZE]; @@ -276,10 +276,8 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) crypto_inc(ctrblk, BF_BLOCK_SIZE); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk) { - struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = BF_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -332,29 +330,30 @@ done: return 
nbytes; } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct bf_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = __ctr_crypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } - if (walk.nbytes) { - ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk); - err = blkcipher_walk_done(desc, &walk, 0); + if (nbytes) { + ctr_crypt_final(ctx, &walk); + err = skcipher_walk_done(&walk, 0); } return err; } -static struct crypto_alg bf_algs[4] = { { +static struct crypto_alg bf_cipher_alg = { .cra_name = "blowfish", .cra_driver_name = "blowfish-asm", .cra_priority = 200, @@ -372,66 +371,50 @@ static struct crypto_alg bf_algs[4] = { { .cia_decrypt = blowfish_decrypt, } } -}, { - .cra_name = "ecb(blowfish)", - .cra_driver_name = "ecb-blowfish-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, +}; + +static struct skcipher_alg bf_skcipher_algs[] = { + { + .base.cra_name = "ecb(blowfish)", + .base.cra_driver_name = "ecb-blowfish-asm", + .base.cra_priority = 300, + .base.cra_blocksize = BF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct 
bf_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .setkey = blowfish_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(blowfish)", + .base.cra_driver_name = "cbc-blowfish-asm", + .base.cra_priority = 300, + .base.cra_blocksize = BF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct bf_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "ctr(blowfish)", + .base.cra_driver_name = "ctr-blowfish-asm", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct bf_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .chunksize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, }, -}, { - .cra_name = "cbc(blowfish)", - .cra_driver_name = "cbc-blowfish-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ctr(blowfish)", - .cra_driver_name = "ctr-blowfish-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, 
- .setkey = blowfish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -} }; +}; static bool is_blacklisted_cpu(void) { @@ -456,6 +439,8 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); static int __init init(void) { + int err; + if (!force && is_blacklisted_cpu()) { printk(KERN_INFO "blowfish-x86_64: performance on this CPU " @@ -464,12 +449,23 @@ static int __init init(void) return -ENODEV; } - return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); + err = crypto_register_alg(&bf_cipher_alg); + if (err) + return err; + + err = crypto_register_skciphers(bf_skcipher_algs, + ARRAY_SIZE(bf_skcipher_algs)); + if (err) + crypto_unregister_alg(&bf_cipher_alg); + + return err; } static void __exit fini(void) { - crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); + crypto_unregister_alg(&bf_cipher_alg); + crypto_unregister_skciphers(bf_skcipher_algs, + ARRAY_SIZE(bf_skcipher_algs)); } module_init(init); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 60907c139c4e..d4992e458f92 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -10,18 +10,15 @@ * */ -#include <linux/module.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/ablk_helper.h> -#include <crypto/algapi.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> -#include <asm/fpu/api.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> +#include <crypto/algapi.h> +#include <crypto/internal/simd.h> +#include <crypto/xts.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/types.h> #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 @@ -150,413 +147,120 @@ static const struct common_glue_ctx camellia_dec_xts = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist 
*dst, - struct scatterlist *src, unsigned int nbytes) +static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) { - return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); + return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, + &tfm->base.crt_flags); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); + return glue_ecb_req_128bit(&camellia_enc, req); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); -} - -static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, - CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, - nbytes); -} - -static inline void camellia_fpu_end(bool fpu_enabled) -{ - glue_fpu_end(fpu_enabled); -} - -static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, - &tfm->crt_flags); -} - -struct crypt_priv { - struct camellia_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - const unsigned int bsize = CAMELLIA_BLOCK_SIZE; - 
struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { - camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; - } - - if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { - camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - } - - while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { - camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - camellia_enc_blk(ctx->ctx, srcdst, srcdst); + return glue_ecb_req_128bit(&camellia_dec, req); } -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - const unsigned int bsize = CAMELLIA_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { - camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; - } - - if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { - camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - } - - while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { - camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - camellia_dec_blk(ctx->ctx, srcdst, srcdst); + return 
glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), + req); } -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->camellia_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - camellia_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->camellia_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - camellia_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_ctr_req_128bit(&camellia_ctr, req); } -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct camellia_xts_ctx *ctx = 
crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&camellia_enc_xts, req, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&camellia_dec_xts, req, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static struct crypto_alg cmll_algs[10] = { { - .cra_name = "__ecb-camellia-aesni-avx2", - .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = camellia_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-camellia-aesni-avx2", - .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - 
.max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = camellia_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-camellia-aesni-avx2", - .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-camellia-aesni-avx2", - .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_camellia_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = lrw_camellia_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-camellia-aesni-avx2", - .cra_driver_name = "__driver-xts-camellia-aesni-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, - .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = 
xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(camellia)", - .cra_driver_name = "ecb-camellia-aesni-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(camellia)", - .cra_driver_name = "cbc-camellia-aesni-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(camellia)", - .cra_driver_name = "ctr-camellia-aesni-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(camellia)", - .cra_driver_name = "lrw-camellia-aesni-avx2", - .cra_priority 
= 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(camellia)", - .cra_driver_name = "xts-camellia-aesni-avx2", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, - .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, +static struct skcipher_alg camellia_algs[] = { + { + .base.cra_name = "__ecb(camellia)", + .base.cra_driver_name = "__ecb-camellia-aesni-avx2", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(camellia)", + .base.cra_driver_name = "__cbc-camellia-aesni-avx2", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + 
.base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(camellia)", + .base.cra_driver_name = "__ctr-camellia-aesni-avx2", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .chunksize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base.cra_name = "__xts(camellia)", + .base.cra_driver_name = "__xts-camellia-aesni-avx2", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, + .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = xts_camellia_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, }, -} }; +}; + +static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; static int __init camellia_aesni_init(void) { @@ -576,12 +280,15 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); + return simd_register_skciphers_compat(camellia_algs, + ARRAY_SIZE(camellia_algs), + camellia_simd_algs); } static void __exit camellia_aesni_fini(void) { - crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); + simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), + camellia_simd_algs); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 
d96429da88eb..d09f6521466a 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -10,18 +10,15 @@ * */ -#include <linux/module.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/ablk_helper.h> -#include <crypto/algapi.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> -#include <asm/fpu/api.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> +#include <crypto/algapi.h> +#include <crypto/internal/simd.h> +#include <crypto/xts.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/types.h> #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 @@ -154,401 +151,142 @@ static const struct common_glue_ctx camellia_dec_xts = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) { - return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); + return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, + &tfm->base.crt_flags); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, - dst, src, nbytes); + return glue_ecb_req_128bit(&camellia_enc, req); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, - nbytes); + 
return glue_ecb_req_128bit(&camellia_dec, req); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), + req); } -static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, - CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, - nbytes); + return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } -static inline void camellia_fpu_end(bool fpu_enabled) +static int ctr_crypt(struct skcipher_request *req) { - glue_fpu_end(fpu_enabled); + return glue_ctr_req_128bit(&camellia_ctr, req); } -static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) +int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) { - return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, - &tfm->crt_flags); + struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 *flags = &tfm->base.crt_flags; + int err; + + err = xts_verify_key(tfm, key, keylen); + if (err) + return err; + + /* first half of xts-key is for crypt */ + err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); } +EXPORT_SYMBOL_GPL(xts_camellia_setkey); -struct crypt_priv { - struct camellia_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - const unsigned int bsize = CAMELLIA_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, 
nbytes); - - if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { - camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - } - - while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { - camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; - } + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - camellia_enc_blk(ctx->ctx, srcdst, srcdst); + return glue_xts_req_128bit(&camellia_enc_xts, req, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - const unsigned int bsize = CAMELLIA_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { - camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; - } + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { - camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; - nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - camellia_dec_blk(ctx->ctx, srcdst, srcdst); + return glue_xts_req_128bit(&camellia_dec_xts, req, + XTS_TWEAK_CAST(camellia_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct 
camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->camellia_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - camellia_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->camellia_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - camellia_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg cmll_algs[10] = { { - .cra_name = "__ecb-camellia-aesni", - 
.cra_driver_name = "__driver-ecb-camellia-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = camellia_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-camellia-aesni", - .cra_driver_name = "__driver-cbc-camellia-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = camellia_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-camellia-aesni", - .cra_driver_name = "__driver-ctr-camellia-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-camellia-aesni", - .cra_driver_name = "__driver-lrw-camellia-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = 
THIS_MODULE, - .cra_exit = lrw_camellia_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = lrw_camellia_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-camellia-aesni", - .cra_driver_name = "__driver-xts-camellia-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, - .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(camellia)", - .cra_driver_name = "ecb-camellia-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(camellia)", - .cra_driver_name = "cbc-camellia-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = 
CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(camellia)", - .cra_driver_name = "ctr-camellia-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(camellia)", - .cra_driver_name = "lrw-camellia-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(camellia)", - .cra_driver_name = "xts-camellia-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, - .max_keysize = 
CAMELLIA_MAX_KEY_SIZE * 2, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, +static struct skcipher_alg camellia_algs[] = { + { + .base.cra_name = "__ecb(camellia)", + .base.cra_driver_name = "__ecb-camellia-aesni", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(camellia)", + .base.cra_driver_name = "__cbc-camellia-aesni", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(camellia)", + .base.cra_driver_name = "__ctr-camellia-aesni", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .chunksize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base.cra_name = "__xts(camellia)", + .base.cra_driver_name = "__xts-camellia-aesni", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, + .max_keysize = 2 * 
CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = xts_camellia_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, }, -} }; +}; + +static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; static int __init camellia_aesni_init(void) { @@ -567,12 +305,15 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); + return simd_register_skciphers_compat(camellia_algs, + ARRAY_SIZE(camellia_algs), + camellia_simd_algs); } static void __exit camellia_aesni_fini(void) { - crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); + simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), + camellia_simd_algs); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index af4840ab2a3d..dcd5e0f71b00 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -23,15 +23,12 @@ * */ -#include <asm/processor.h> #include <asm/unaligned.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> @@ -1272,13 +1269,19 @@ int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, } EXPORT_SYMBOL_GPL(__camellia_setkey); -static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, +static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { - return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, + return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len, &tfm->crt_flags); } +static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int key_len) +{ + return camellia_setkey(&tfm->base, key, key_len); +} + void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) { u128 iv = *src; 
@@ -1373,188 +1376,33 @@ static const struct common_glue_ctx camellia_dec_cbc = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); -} - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = CAMELLIA_BLOCK_SIZE; - struct camellia_ctx *ctx = priv; - int i; - - while (nbytes >= 2 * bsize) { - camellia_enc_blk_2way(ctx, srcdst, srcdst); - srcdst += bsize * 2; - nbytes -= bsize * 2; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - camellia_enc_blk(ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = CAMELLIA_BLOCK_SIZE; - struct camellia_ctx *ctx = priv; - int i; - - while (nbytes >= 2 * bsize) { - camellia_dec_blk_2way(ctx, srcdst, srcdst); - srcdst += bsize * 2; - nbytes -= bsize * 2; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - camellia_dec_blk(ctx, srcdst, srcdst); -} - -int lrw_camellia_setkey(struct crypto_tfm 
*tfm, const u8 *key, - unsigned int keylen) -{ - struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = __camellia_setkey(&ctx->camellia_ctx, key, - keylen - CAMELLIA_BLOCK_SIZE, - &tfm->crt_flags); - if (err) - return err; - - return lrw_init_table(&ctx->lrw_table, - key + keylen - CAMELLIA_BLOCK_SIZE); -} -EXPORT_SYMBOL_GPL(lrw_camellia_setkey); - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[2 * 4]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &ctx->camellia_ctx, - .crypt_fn = encrypt_callback, - }; - - return lrw_crypt(desc, dst, src, nbytes, &req); -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[2 * 4]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &ctx->camellia_ctx, - .crypt_fn = decrypt_callback, - }; - - return lrw_crypt(desc, dst, src, nbytes, &req); + return glue_ecb_req_128bit(&camellia_enc, req); } -void lrw_camellia_exit_tfm(struct crypto_tfm *tfm) +static int ecb_decrypt(struct skcipher_request *req) { - struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); + return glue_ecb_req_128bit(&camellia_dec, req); } -EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm); -int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +static int cbc_encrypt(struct skcipher_request *req) { - struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; - - /* first half of 
xts-key is for crypt */ - err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), + req); } -EXPORT_SYMBOL_GPL(xts_camellia_setkey); -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - le128 buf[2 * 4]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), - .crypt_ctx = &ctx->crypt_ctx, - .crypt_fn = encrypt_callback, - }; - - return xts_crypt(desc, dst, src, nbytes, &req); + return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - le128 buf[2 * 4]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), - .crypt_ctx = &ctx->crypt_ctx, - .crypt_fn = decrypt_callback, - }; - - return xts_crypt(desc, dst, src, nbytes, &req); + return glue_ctr_req_128bit(&camellia_ctr, req); } -static struct crypto_alg camellia_algs[6] = { { +static struct crypto_alg camellia_cipher_alg = { .cra_name = "camellia", .cra_driver_name = "camellia-asm", .cra_priority = 200, @@ -1572,109 +1420,50 @@ static struct crypto_alg camellia_algs[6] = { { .cia_decrypt = camellia_decrypt } } -}, { - .cra_name = "ecb(camellia)", - .cra_driver_name = "ecb-camellia-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 
- .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .setkey = camellia_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "cbc(camellia)", - .cra_driver_name = "cbc-camellia-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ctr(camellia)", - .cra_driver_name = "ctr-camellia-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = camellia_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "lrw(camellia)", - .cra_driver_name = "lrw-camellia-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_camellia_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .max_keysize = CAMELLIA_MAX_KEY_SIZE + - CAMELLIA_BLOCK_SIZE, - .ivsize = 
CAMELLIA_BLOCK_SIZE, - .setkey = lrw_camellia_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "xts(camellia)", - .cra_driver_name = "xts-camellia-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = CAMELLIA_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct camellia_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, - .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, - .ivsize = CAMELLIA_BLOCK_SIZE, - .setkey = xts_camellia_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -} }; +}; + +static struct skcipher_alg camellia_skcipher_algs[] = { + { + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-asm", + .base.cra_priority = 300, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-asm", + .base.cra_priority = 300, + .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "ctr(camellia)", + .base.cra_driver_name = "ctr-camellia-asm", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct camellia_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .chunksize = CAMELLIA_BLOCK_SIZE, + 
.setkey = camellia_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; static bool is_blacklisted_cpu(void) { @@ -1700,6 +1489,8 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); static int __init init(void) { + int err; + if (!force && is_blacklisted_cpu()) { printk(KERN_INFO "camellia-x86_64: performance on this CPU " @@ -1708,12 +1499,23 @@ static int __init init(void) return -ENODEV; } - return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); + err = crypto_register_alg(&camellia_cipher_alg); + if (err) + return err; + + err = crypto_register_skciphers(camellia_skcipher_algs, + ARRAY_SIZE(camellia_skcipher_algs)); + if (err) + crypto_unregister_alg(&camellia_cipher_alg); + + return err; } static void __exit fini(void) { - crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); + crypto_unregister_alg(&camellia_cipher_alg); + crypto_unregister_skciphers(camellia_skcipher_algs, + ARRAY_SIZE(camellia_skcipher_algs)); } module_init(init); diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index dbea6020ffe7..41034745d6a2 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -21,18 +21,14 @@ * */ -#include <linux/module.h> -#include <linux/hardirq.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/err.h> -#include <crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/cast5.h> -#include <crypto/cryptd.h> -#include <crypto/ctr.h> -#include <asm/fpu/api.h> -#include <asm/crypto/glue_helper.h> +#include <crypto/internal/simd.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/types.h> #define CAST5_PARALLEL_BLOCKS 16 @@ -45,10 +41,17 @@ asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, __be64 *iv); -static inline bool 
cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + return cast5_setkey(&tfm->base, key, keylen); +} + +static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk, + unsigned int nbytes) { return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS, - NULL, fpu_enabled, nbytes); + walk, fpu_enabled, nbytes); } static inline void cast5_fpu_end(bool fpu_enabled) @@ -56,29 +59,28 @@ static inline void cast5_fpu_end(bool fpu_enabled) return glue_fpu_end(fpu_enabled); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) +static int ecb_crypt(struct skcipher_request *req, bool enc) { bool fpu_enabled = false; - struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes; void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src); int err; - fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way; - - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; + while ((nbytes = walk.nbytes)) { + u8 *wsrc = walk.src.virt.addr; + u8 *wdst = walk.dst.virt.addr; - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { + fn = (enc) ? 
cast5_ecb_enc_16way : cast5_ecb_dec_16way; do { fn(ctx, wdst, wsrc); @@ -103,76 +105,58 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, } while (nbytes >= bsize); done: - err = blkcipher_walk_done(desc, walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } cast5_fpu_end(fpu_enabled); return err; } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); + return ecb_crypt(req, true); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); + return ecb_crypt(req, false); } -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_encrypt(struct skcipher_request *req) { - struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u64 *src = (u64 *)walk->src.virt.addr; - u64 *dst = (u64 *)walk->dst.virt.addr; - u64 *iv = (u64 *)walk->iv; - - do { - *dst = *src ^ *iv; - __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u64 *)walk->iv = *iv; - return nbytes; -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, 
nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + u64 *src = (u64 *)walk.src.virt.addr; + u64 *dst = (u64 *)walk.dst.virt.addr; + u64 *iv = (u64 *)walk.iv; + + do { + *dst = *src ^ *iv; + __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + src++; + dst++; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u64 *)walk.iv = *iv; + err = skcipher_walk_done(&walk, nbytes); } return err; } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __cbc_decrypt(struct cast5_ctx *ctx, + struct skcipher_walk *walk) { - struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -224,31 +208,29 @@ done: return nbytes; } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); bool fpu_enabled = false; - struct blkcipher_walk walk; + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); + nbytes = __cbc_decrypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } cast5_fpu_end(fpu_enabled); return err; } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct 
blkcipher_walk *walk) +static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx) { - struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); u8 *ctrblk = walk->iv; u8 keystream[CAST5_BLOCK_SIZE]; u8 *src = walk->src.virt.addr; @@ -261,10 +243,9 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __ctr_crypt(struct skcipher_walk *walk, + struct cast5_ctx *ctx) { - struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -307,162 +288,80 @@ done: return nbytes; } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); bool fpu_enabled = false; - struct blkcipher_walk walk; + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CAST5_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); + nbytes = __ctr_crypt(&walk, ctx); + err = skcipher_walk_done(&walk, nbytes); } cast5_fpu_end(fpu_enabled); if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); + ctr_crypt_final(&walk, ctx); + err = skcipher_walk_done(&walk, 0); } return err; } +static struct skcipher_alg cast5_algs[] = { + { + .base.cra_name = 
"__ecb(cast5)", + .base.cra_driver_name = "__ecb-cast5-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAST5_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct cast5_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .setkey = cast5_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(cast5)", + .base.cra_driver_name = "__cbc-cast5-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAST5_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct cast5_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .ivsize = CAST5_BLOCK_SIZE, + .setkey = cast5_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(cast5)", + .base.cra_driver_name = "__ctr-cast5-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct cast5_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .ivsize = CAST5_BLOCK_SIZE, + .chunksize = CAST5_BLOCK_SIZE, + .setkey = cast5_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; -static struct crypto_alg cast5_algs[6] = { { - .cra_name = "__ecb-cast5-avx", - .cra_driver_name = "__driver-ecb-cast5-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAST5_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct cast5_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .setkey = cast5_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-cast5-avx", - 
.cra_driver_name = "__driver-cbc-cast5-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAST5_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct cast5_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .setkey = cast5_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-cast5-avx", - .cra_driver_name = "__driver-ctr-cast5-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cast5_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .ivsize = CAST5_BLOCK_SIZE, - .setkey = cast5_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "ecb(cast5)", - .cra_driver_name = "ecb-cast5-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAST5_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(cast5)", - .cra_driver_name = "cbc-cast5-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAST5_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = 
ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .ivsize = CAST5_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(cast5)", - .cra_driver_name = "ctr-cast5-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST5_MIN_KEY_SIZE, - .max_keysize = CAST5_MAX_KEY_SIZE, - .ivsize = CAST5_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -} }; +static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)]; static int __init cast5_init(void) { @@ -474,12 +373,15 @@ static int __init cast5_init(void) return -ENODEV; } - return crypto_register_algs(cast5_algs, ARRAY_SIZE(cast5_algs)); + return simd_register_skciphers_compat(cast5_algs, + ARRAY_SIZE(cast5_algs), + cast5_simd_algs); } static void __exit cast5_exit(void) { - crypto_unregister_algs(cast5_algs, ARRAY_SIZE(cast5_algs)); + simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs), + cast5_simd_algs); } module_init(cast5_init); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 50e684768c55..9fb66b5e94b2 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -24,19 +24,13 @@ */ #include <linux/module.h> -#include <linux/hardirq.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> -#include <crypto/ablk_helper.h> #include <crypto/algapi.h> #include <crypto/cast6.h> -#include <crypto/cryptd.h> -#include <crypto/b128ops.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> +#include 
<crypto/internal/simd.h> #include <crypto/xts.h> -#include <asm/fpu/api.h> #include <asm/crypto/glue_helper.h> #define CAST6_PARALLEL_BLOCKS 8 @@ -56,6 +50,12 @@ asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return cast6_setkey(&tfm->base, key, keylen); +} + static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) { glue_xts_crypt_128bit_one(ctx, dst, src, iv, @@ -157,164 +157,30 @@ static const struct common_glue_ctx cast6_dec_xts = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&cast6_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&cast6_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__cast6_encrypt), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&cast6_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_ctr_crypt_128bit(&cast6_ctr, desc, dst, src, nbytes); + return glue_ecb_req_128bit(&cast6_enc, req); } -static inline bool cast6_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return glue_fpu_begin(CAST6_BLOCK_SIZE, 
CAST6_PARALLEL_BLOCKS, - NULL, fpu_enabled, nbytes); + return glue_ecb_req_128bit(&cast6_dec, req); } -static inline void cast6_fpu_end(bool fpu_enabled) +static int cbc_encrypt(struct skcipher_request *req) { - glue_fpu_end(fpu_enabled); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__cast6_encrypt), + req); } -struct crypt_priv { - struct cast6_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - const unsigned int bsize = CAST6_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { - cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __cast6_encrypt(ctx->ctx, srcdst, srcdst); + return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req); } -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - const unsigned int bsize = CAST6_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { - cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __cast6_decrypt(ctx->ctx, srcdst, srcdst); -} - -struct cast6_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct cast6_ctx cast6_ctx; -}; - -static int lrw_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = __cast6_setkey(&ctx->cast6_ctx, key, keylen - CAST6_BLOCK_SIZE, - &tfm->crt_flags); - if (err) - return err; - - return lrw_init_table(&ctx->lrw_table, key + keylen - CAST6_BLOCK_SIZE); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct 
scatterlist *src, unsigned int nbytes) -{ - struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAST6_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->cast6_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - cast6_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[CAST6_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->cast6_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - cast6_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static void lrw_exit_tfm(struct crypto_tfm *tfm) -{ - struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); + return glue_ctr_req_128bit(&cast6_ctr, req); } struct cast6_xts_ctx { @@ -322,14 +188,14 @@ struct cast6_xts_ctx { struct cast6_ctx crypt_ctx; }; -static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) { - struct cast6_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; + struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 *flags = &tfm->base.crt_flags; int err; - err = xts_check_key(tfm, key, keylen); + err = xts_verify_key(tfm, key, keylen); if (err) return err; @@ -343,245 
+209,87 @@ static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, flags); } -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(__cast6_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&cast6_enc_xts, req, + XTS_TWEAK_CAST(__cast6_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(__cast6_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + return glue_xts_req_128bit(&cast6_dec_xts, req, + XTS_TWEAK_CAST(__cast6_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static struct crypto_alg cast6_algs[10] = { { - .cra_name = "__ecb-cast6-avx", - .cra_driver_name = "__driver-ecb-cast6-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct cast6_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .setkey = cast6_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-cast6-avx", - .cra_driver_name = 
"__driver-cbc-cast6-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct cast6_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .setkey = cast6_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-cast6-avx", - .cra_driver_name = "__driver-ctr-cast6-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cast6_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = cast6_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-cast6-avx", - .cra_driver_name = "__driver-lrw-cast6-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct cast6_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE + - CAST6_BLOCK_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE + - CAST6_BLOCK_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = lrw_cast6_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-cast6-avx", - .cra_driver_name = "__driver-xts-cast6-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct cast6_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = 
THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE * 2, - .max_keysize = CAST6_MAX_KEY_SIZE * 2, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = xts_cast6_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(cast6)", - .cra_driver_name = "ecb-cast6-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(cast6)", - .cra_driver_name = "cbc-cast6-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(cast6)", - .cra_driver_name = "ctr-cast6-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - 
.decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(cast6)", - .cra_driver_name = "lrw-cast6-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE + - CAST6_BLOCK_SIZE, - .max_keysize = CAST6_MAX_KEY_SIZE + - CAST6_BLOCK_SIZE, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(cast6)", - .cra_driver_name = "xts-cast6-avx", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = CAST6_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = CAST6_MIN_KEY_SIZE * 2, - .max_keysize = CAST6_MAX_KEY_SIZE * 2, - .ivsize = CAST6_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, +static struct skcipher_alg cast6_algs[] = { + { + .base.cra_name = "__ecb(cast6)", + .base.cra_driver_name = "__ecb-cast6-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAST6_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct cast6_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = cast6_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(cast6)", + .base.cra_driver_name = "__cbc-cast6-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 
CAST6_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct cast6_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = cast6_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(cast6)", + .base.cra_driver_name = "__ctr-cast6-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct cast6_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .chunksize = CAST6_BLOCK_SIZE, + .setkey = cast6_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base.cra_name = "__xts(cast6)", + .base.cra_driver_name = "__xts-cast6-avx", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = CAST6_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct cast6_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 2 * CAST6_MIN_KEY_SIZE, + .max_keysize = 2 * CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = xts_cast6_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, }, -} }; +}; + +static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)]; static int __init cast6_init(void) { @@ -593,12 +301,15 @@ static int __init cast6_init(void) return -ENODEV; } - return crypto_register_algs(cast6_algs, ARRAY_SIZE(cast6_algs)); + return simd_register_skciphers_compat(cast6_algs, + ARRAY_SIZE(cast6_algs), + cast6_simd_algs); } static void __exit cast6_exit(void) { - crypto_unregister_algs(cast6_algs, ARRAY_SIZE(cast6_algs)); + simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs), + cast6_simd_algs); } module_init(cast6_init); diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 30c0a37f4882..5c610d4ef9fc 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ 
b/arch/x86/crypto/des3_ede_glue.c @@ -20,13 +20,13 @@ * */ -#include <asm/processor.h> +#include <crypto/algapi.h> #include <crypto/des.h> +#include <crypto/internal/skcipher.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <crypto/algapi.h> struct des3_ede_x86_ctx { u32 enc_expkey[DES3_EDE_EXPKEY_WORDS]; @@ -83,18 +83,18 @@ static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - const u32 *expkey) +static int ecb_crypt(struct skcipher_request *req, const u32 *expkey) { - unsigned int bsize = DES3_EDE_BLOCK_SIZE; + const unsigned int bsize = DES3_EDE_BLOCK_SIZE; + struct skcipher_walk walk; unsigned int nbytes; int err; - err = blkcipher_walk_virt(desc, walk); + err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; + while ((nbytes = walk.nbytes)) { + u8 *wsrc = walk.src.virt.addr; + u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ if (nbytes >= bsize * 3) { @@ -121,36 +121,31 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, } while (nbytes >= bsize); done: - err = blkcipher_walk_done(desc, walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } return err; } -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, ctx->enc_expkey); + return ecb_crypt(req, ctx->enc_expkey); } -static int 
ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, ctx->dec_expkey); + return ecb_crypt(req, ctx->dec_expkey); } -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx, + struct skcipher_walk *walk) { - struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -171,27 +166,27 @@ static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, return nbytes; } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = __cbc_encrypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } return err; } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx, + struct skcipher_walk *walk) { - struct des3_ede_x86_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; @@ -250,25 +245,26 @@ done: return nbytes; } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = __cbc_decrypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } return err; } static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, - struct blkcipher_walk *walk) + struct skcipher_walk *walk) { u8 *ctrblk = walk->iv; u8 keystream[DES3_EDE_BLOCK_SIZE]; @@ -282,10 +278,9 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx, + struct skcipher_walk *walk) { - struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; __be64 *src = (__be64 *)walk->src.virt.addr; @@ -333,23 +328,24 @@ done: return nbytes; } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct blkcipher_walk walk; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); + 
struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) { - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = __ctr_crypt(ctx, &walk); + err = skcipher_walk_done(&walk, nbytes); } - if (walk.nbytes) { - ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk); - err = blkcipher_walk_done(desc, &walk, 0); + if (nbytes) { + ctr_crypt_final(ctx, &walk); + err = skcipher_walk_done(&walk, 0); } return err; @@ -381,7 +377,14 @@ static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, return 0; } -static struct crypto_alg des3_ede_algs[4] = { { +static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, + unsigned int keylen) +{ + return des3_ede_x86_setkey(&tfm->base, key, keylen); +} + +static struct crypto_alg des3_ede_cipher = { .cra_name = "des3_ede", .cra_driver_name = "des3_ede-asm", .cra_priority = 200, @@ -399,66 +402,50 @@ static struct crypto_alg des3_ede_algs[4] = { { .cia_decrypt = des3_ede_x86_decrypt, } } -}, { - .cra_name = "ecb(des3_ede)", - .cra_driver_name = "ecb-des3_ede-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .setkey = des3_ede_x86_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "cbc(des3_ede)", - .cra_driver_name = "cbc-des3_ede-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_EDE_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - 
.cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .setkey = des3_ede_x86_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ctr(des3_ede)", - .cra_driver_name = "ctr-des3_ede-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = DES3_EDE_KEY_SIZE, - .max_keysize = DES3_EDE_KEY_SIZE, - .ivsize = DES3_EDE_BLOCK_SIZE, - .setkey = des3_ede_x86_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -} }; +}; + +static struct skcipher_alg des3_ede_skciphers[] = { + { + .base.cra_name = "ecb(des3_ede)", + .base.cra_driver_name = "ecb-des3_ede-asm", + .base.cra_priority = 300, + .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .setkey = des3_ede_x86_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(des3_ede)", + .base.cra_driver_name = "cbc-des3_ede-asm", + .base.cra_priority = 300, + .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_x86_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "ctr(des3_ede)", + .base.cra_driver_name = "ctr-des3_ede-asm", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .base.cra_module = 
THIS_MODULE, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .chunksize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_x86_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + } +}; static bool is_blacklisted_cpu(void) { @@ -483,17 +470,30 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); static int __init des3_ede_x86_init(void) { + int err; + if (!force && is_blacklisted_cpu()) { pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n"); return -ENODEV; } - return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs)); + err = crypto_register_alg(&des3_ede_cipher); + if (err) + return err; + + err = crypto_register_skciphers(des3_ede_skciphers, + ARRAY_SIZE(des3_ede_skciphers)); + if (err) + crypto_unregister_alg(&des3_ede_cipher); + + return err; } static void __exit des3_ede_x86_fini(void) { - crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs)); + crypto_unregister_alg(&des3_ede_cipher); + crypto_unregister_skciphers(des3_ede_skciphers, + ARRAY_SIZE(des3_ede_skciphers)); } module_init(des3_ede_x86_init); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index d61e57960fe0..a78ef99a9981 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -29,313 +29,212 @@ #include <crypto/b128ops.h> #include <crypto/gf128mul.h> #include <crypto/internal/skcipher.h> -#include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> -static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req) { - void *ctx = crypto_blkcipher_ctx(desc->tfm); + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); const unsigned int bsize = 128 / 8; - unsigned int nbytes, i, func_bytes; + struct 
skcipher_walk walk; bool fpu_enabled = false; + unsigned int nbytes; int err; - err = blkcipher_walk_virt(desc, walk); + err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; + while ((nbytes = walk.nbytes)) { + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + unsigned int func_bytes; + unsigned int i; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); - + &walk, fpu_enabled, nbytes); for (i = 0; i < gctx->num_funcs; i++) { func_bytes = bsize * gctx->funcs[i].num_blocks; - /* Process multi-block batch */ - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.ecb(ctx, wdst, - wsrc); + if (nbytes < func_bytes) + continue; - wsrc += func_bytes; - wdst += func_bytes; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); + /* Process multi-block batch */ + do { + gctx->funcs[i].fn_u.ecb(ctx, dst, src); + src += func_bytes; + dst += func_bytes; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); - if (nbytes < bsize) - goto done; - } + if (nbytes < bsize) + break; } - -done: - err = blkcipher_walk_done(desc, walk, nbytes); + err = skcipher_walk_done(&walk, nbytes); } glue_fpu_end(fpu_enabled); return err; } +EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); -int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, + struct skcipher_request *req) { - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return __glue_ecb_crypt_128bit(gctx, desc, &walk); -} -EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit); - -static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - void *ctx = crypto_blkcipher_ctx(desc->tfm); + void *ctx = 
crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - fn(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - *(u128 *)walk->iv = *iv; - return nbytes; -} - -int glue_cbc_encrypt_128bit(const common_glue_func_t fn, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; + struct skcipher_walk walk; + unsigned int nbytes; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + const u128 *src = (u128 *)walk.src.virt.addr; + u128 *dst = (u128 *)walk.dst.virt.addr; + u128 *iv = (u128 *)walk.iv; + + do { + u128_xor(dst, src, iv); + fn(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + src++; + dst++; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u128 *)walk.iv = *iv; + err = skcipher_walk_done(&walk, nbytes); } - return err; } -EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit); +EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit); -static unsigned int -__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req) { - void *ctx = crypto_blkcipher_ctx(desc->tfm); + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 last_iv; - unsigned int num_blocks, 
func_bytes; - unsigned int i; + struct skcipher_walk walk; + bool fpu_enabled = false; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; + while ((nbytes = walk.nbytes)) { + const u128 *src = walk.src.virt.addr; + u128 *dst = walk.dst.virt.addr; + unsigned int func_bytes, num_blocks; + unsigned int i; + u128 last_iv; - last_iv = *src; + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, nbytes); + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; + last_iv = *src; - /* Process multi-block batch */ - if (nbytes >= func_bytes) { + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes < func_bytes) + continue; + + /* Process multi-block batch */ do { - nbytes -= func_bytes - bsize; src -= num_blocks - 1; dst -= num_blocks - 1; gctx->funcs[i].fn_u.cbc(ctx, dst, src); - nbytes -= bsize; + nbytes -= func_bytes; if (nbytes < bsize) goto done; - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; + u128_xor(dst, dst, --src); + dst--; } while (nbytes >= func_bytes); } - } - done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; - - return nbytes; -} - -int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - - while ((nbytes = walk.nbytes)) { - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); - nbytes = 
__glue_cbc_decrypt_128bit(gctx, desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + u128_xor(dst, dst, (u128 *)walk.iv); + *(u128 *)walk.iv = last_iv; + err = skcipher_walk_done(&walk, nbytes); } glue_fpu_end(fpu_enabled); return err; } -EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); +EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); -static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req) { - void *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *src = (u8 *)walk->src.virt.addr; - u8 *dst = (u8 *)walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - le128 ctrblk; - u128 tmp; + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; + bool fpu_enabled = false; + unsigned int nbytes; + int err; - be128_to_le128(&ctrblk, (be128 *)walk->iv); + err = skcipher_walk_virt(&walk, req, false); - memcpy(&tmp, src, nbytes); - fn_ctr(ctx, &tmp, &tmp, &ctrblk); - memcpy(dst, &tmp, nbytes); + while ((nbytes = walk.nbytes) >= bsize) { + const u128 *src = walk.src.virt.addr; + u128 *dst = walk.dst.virt.addr; + unsigned int func_bytes, num_blocks; + unsigned int i; + le128 ctrblk; - le128_to_be128((be128 *)walk->iv, &ctrblk); -} + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, nbytes); -static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - const unsigned int bsize = 128 / 8; - void *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - le128 ctrblk; - unsigned int num_blocks, func_bytes; - unsigned int i; + be128_to_le128(&ctrblk, (be128 *)walk.iv); - be128_to_le128(&ctrblk, (be128 
*)walk->iv); + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; - /* Process multi-block batch */ - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; + if (nbytes < func_bytes) + continue; - if (nbytes >= func_bytes) { + /* Process multi-block batch */ do { gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); - src += num_blocks; dst += num_blocks; nbytes -= func_bytes; } while (nbytes >= func_bytes); if (nbytes < bsize) - goto done; + break; } - } - -done: - le128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; -} - -int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, bsize); - while ((nbytes = walk.nbytes) >= bsize) { - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, nbytes); - nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + le128_to_be128((be128 *)walk.iv, &ctrblk); + err = skcipher_walk_done(&walk, nbytes); } glue_fpu_end(fpu_enabled); - if (walk.nbytes) { - glue_ctr_crypt_final_128bit( - gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; -} -EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); - -static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, - void *ctx, - struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - const unsigned int bsize = 128 / 8; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - unsigned int num_blocks, func_bytes; - unsigned int 
i; - - /* Process multi-block batch */ - for (i = 0; i < gctx->num_funcs; i++) { - num_blocks = gctx->funcs[i].num_blocks; - func_bytes = bsize * num_blocks; - - if (nbytes >= func_bytes) { - do { - gctx->funcs[i].fn_u.xts(ctx, dst, src, - (le128 *)walk->iv); + if (nbytes) { + le128 ctrblk; + u128 tmp; - src += num_blocks; - dst += num_blocks; - nbytes -= func_bytes; - } while (nbytes >= func_bytes); + be128_to_le128(&ctrblk, (be128 *)walk.iv); + memcpy(&tmp, walk.src.virt.addr, nbytes); + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, &tmp, &tmp, + &ctrblk); + memcpy(walk.dst.virt.addr, &tmp, nbytes); + le128_to_be128((be128 *)walk.iv, &ctrblk); - if (nbytes < bsize) - goto done; - } + err = skcipher_walk_done(&walk, 0); } -done: - return nbytes; + return err; } +EXPORT_SYMBOL_GPL(glue_ctr_req_128bit); static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx, void *ctx, @@ -372,46 +271,6 @@ done: return nbytes; } -/* for implementations implementing faster XTS IV generator */ -int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes, - void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src), - void *tweak_ctx, void *crypt_ctx) -{ - const unsigned int bsize = 128 / 8; - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - - err = blkcipher_walk_virt(desc, &walk); - nbytes = walk.nbytes; - if (!nbytes) - return err; - - /* set minimum length to bsize, for tweak_fn */ - fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - desc, fpu_enabled, - nbytes < bsize ? 
bsize : nbytes); - - /* calculate first value of T */ - tweak_fn(tweak_ctx, walk.iv, walk.iv); - - while (nbytes) { - nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); - - err = blkcipher_walk_done(desc, &walk, nbytes); - nbytes = walk.nbytes; - } - - glue_fpu_end(fpu_enabled); - - return err; -} -EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); - int glue_xts_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req, common_glue_func_t tweak_fn, void *tweak_ctx, @@ -429,9 +288,9 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx, return err; /* set minimum length to bsize, for tweak_fn */ - fpu_enabled = glue_skwalk_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, - nbytes < bsize ? bsize : nbytes); + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, + nbytes < bsize ? bsize : nbytes); /* calculate first value of T */ tweak_fn(tweak_ctx, walk.iv, walk.iv); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 870f6d812a2d..03347b16ac9d 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -14,15 +14,12 @@ #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> -#include <crypto/ablk_helper.h> #include <crypto/algapi.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> +#include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <asm/fpu/api.h> -#include <asm/crypto/serpent-avx.h> +#include <crypto/xts.h> #include <asm/crypto/glue_helper.h> +#include <asm/crypto/serpent-avx.h> #define SERPENT_AVX2_PARALLEL_BLOCKS 16 @@ -40,6 +37,12 @@ asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return 
__serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); +} + static const struct common_glue_ctx serpent_enc = { .num_funcs = 3, .fpu_blocks_limit = 8, @@ -136,403 +139,113 @@ static const struct common_glue_ctx serpent_dec_xts = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); + return glue_ecb_req_128bit(&serpent_enc, req); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); + return glue_ecb_req_128bit(&serpent_dec, req); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, - dst, src, nbytes); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), + req); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, - nbytes); + return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); + return glue_ctr_req_128bit(&serpent_ctr, req); } -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - /* since reusing AVX functions, 
starts using FPU at 8 parallel blocks */ - return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); -} + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); -static inline void serpent_fpu_end(bool fpu_enabled) -{ - glue_fpu_end(fpu_enabled); + return glue_xts_req_128bit(&serpent_enc_xts, req, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -struct crypt_priv { - struct serpent_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - const unsigned int bsize = SERPENT_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { - serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; - } + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { - serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); - srcdst += bsize * SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __serpent_encrypt(ctx->ctx, srcdst, srcdst); + return glue_xts_req_128bit(&serpent_dec_xts, req, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = SERPENT_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { - serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst); - srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; - nbytes -= bsize * 
SERPENT_AVX2_PARALLEL_BLOCKS; - } - - while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { - serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); - srcdst += bsize * SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __serpent_decrypt(ctx->ctx, srcdst, srcdst); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->serpent_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->serpent_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(__serpent_encrypt), - 
&ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg srp_algs[10] = { { - .cra_name = "__ecb-serpent-avx2", - .cra_driver_name = "__driver-ecb-serpent-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-serpent-avx2", - .cra_driver_name = "__driver-cbc-serpent-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-serpent-avx2", - .cra_driver_name = "__driver-ctr-serpent-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = 
LIST_HEAD_INIT(srp_algs[2].cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-serpent-avx2", - .cra_driver_name = "__driver-lrw-serpent-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list), - .cra_exit = lrw_serpent_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = lrw_serpent_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-serpent-avx2", - .cra_driver_name = "__driver-xts-serpent-avx2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(serpent)", - .cra_driver_name = "ecb-serpent-avx2", - .cra_priority = 600, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = 
LIST_HEAD_INIT(srp_algs[5].cra_list), - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(serpent)", - .cra_driver_name = "cbc-serpent-avx2", - .cra_priority = 600, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list), - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(serpent)", - .cra_driver_name = "ctr-serpent-avx2", - .cra_priority = 600, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list), - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(serpent)", - .cra_driver_name = "lrw-serpent-avx2", - .cra_priority = 600, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = 
LIST_HEAD_INIT(srp_algs[8].cra_list), - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(serpent)", - .cra_driver_name = "xts-serpent-avx2", - .cra_priority = 600, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list), - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, +static struct skcipher_alg serpent_algs[] = { + { + .base.cra_name = "__ecb(serpent)", + .base.cra_driver_name = "__ecb-serpent-avx2", + .base.cra_priority = 600, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(serpent)", + .base.cra_driver_name = "__cbc-serpent-avx2", + .base.cra_priority = 600, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = 
cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(serpent)", + .base.cra_driver_name = "__ctr-serpent-avx2", + .base.cra_priority = 600, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .chunksize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base.cra_name = "__xts(serpent)", + .base.cra_driver_name = "__xts-serpent-avx2", + .base.cra_priority = 600, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, + .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, }, -} }; +}; + +static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init init(void) { @@ -548,12 +261,15 @@ static int __init init(void) return -ENODEV; } - return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs)); + return simd_register_skciphers_compat(serpent_algs, + ARRAY_SIZE(serpent_algs), + serpent_simd_algs); } static void __exit fini(void) { - crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs)); + simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), + serpent_simd_algs); } module_init(init); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 6f778d3daa22..458567ecf76c 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -24,21 +24,15 @@ */ #include <linux/module.h> -#include <linux/hardirq.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> -#include <crypto/ablk_helper.h> #include 
<crypto/algapi.h> +#include <crypto/internal/simd.h> #include <crypto/serpent.h> -#include <crypto/cryptd.h> -#include <crypto/b128ops.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/fpu/api.h> -#include <asm/crypto/serpent-avx.h> #include <asm/crypto/glue_helper.h> +#include <asm/crypto/serpent-avx.h> /* 8-way parallel cipher functions */ asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, @@ -91,6 +85,31 @@ void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) } EXPORT_SYMBOL_GPL(serpent_xts_dec); +static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); +} + +int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = xts_verify_key(tfm, key, keylen); + if (err) + return err; + + /* first half of xts-key is for crypt */ + err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); +} +EXPORT_SYMBOL_GPL(xts_serpent_setkey); static const struct common_glue_ctx serpent_enc = { .num_funcs = 2, @@ -170,423 +189,113 @@ static const struct common_glue_ctx serpent_dec_xts = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); + return glue_ecb_req_128bit(&serpent_enc, req); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, 
nbytes); + return glue_ecb_req_128bit(&serpent_dec, req); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, - dst, src, nbytes); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), + req); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, - nbytes); + return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); + return glue_ctr_req_128bit(&serpent_ctr, req); } -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, - NULL, fpu_enabled, nbytes); -} + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); -static inline void serpent_fpu_end(bool fpu_enabled) -{ - glue_fpu_end(fpu_enabled); + return glue_xts_req_128bit(&serpent_enc_xts, req, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -struct crypt_priv { - struct serpent_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = SERPENT_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { - 
serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __serpent_encrypt(ctx->ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - const unsigned int bsize = SERPENT_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { - serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __serpent_decrypt(ctx->ctx, srcdst, srcdst); -} - -int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - - SERPENT_BLOCK_SIZE); - if (err) - return err; - - return lrw_init_table(&ctx->lrw_table, key + keylen - - SERPENT_BLOCK_SIZE); -} -EXPORT_SYMBOL_GPL(lrw_serpent_setkey); - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->serpent_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; + return glue_xts_req_128bit(&serpent_dec_xts, req, + XTS_TWEAK_CAST(__serpent_encrypt), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static int 
lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->serpent_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -void lrw_serpent_exit_tfm(struct crypto_tfm *tfm) -{ - struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} -EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm); - -int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} -EXPORT_SYMBOL_GPL(xts_serpent_setkey); - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, - 
XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg serpent_algs[10] = { { - .cra_name = "__ecb-serpent-avx", - .cra_driver_name = "__driver-ecb-serpent-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-serpent-avx", - .cra_driver_name = "__driver-cbc-serpent-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-serpent-avx", - .cra_driver_name = "__driver-ctr-serpent-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-serpent-avx", - .cra_driver_name = "__driver-lrw-serpent-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - 
.cra_ctxsize = sizeof(struct serpent_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_serpent_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = lrw_serpent_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-serpent-avx", - .cra_driver_name = "__driver-xts-serpent-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(serpent)", - .cra_driver_name = "ecb-serpent-avx", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(serpent)", - .cra_driver_name = "cbc-serpent-avx", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = 
ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(serpent)", - .cra_driver_name = "ctr-serpent-avx", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(serpent)", - .cra_driver_name = "lrw-serpent-avx", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(serpent)", - .cra_driver_name = "xts-serpent-avx", - .cra_priority = 500, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = 
SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, +static struct skcipher_alg serpent_algs[] = { + { + .base.cra_name = "__ecb(serpent)", + .base.cra_driver_name = "__ecb-serpent-avx", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(serpent)", + .base.cra_driver_name = "__cbc-serpent-avx", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(serpent)", + .base.cra_driver_name = "__ctr-serpent-avx", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .chunksize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base.cra_name = "__xts(serpent)", + .base.cra_driver_name = "__xts-serpent-avx", + .base.cra_priority = 500, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 2 * 
SERPENT_MIN_KEY_SIZE, + .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, }, -} }; +}; + +static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init serpent_init(void) { @@ -598,12 +307,15 @@ static int __init serpent_init(void) return -ENODEV; } - return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); + return simd_register_skciphers_compat(serpent_algs, + ARRAY_SIZE(serpent_algs), + serpent_simd_algs); } static void __exit serpent_exit(void) { - crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); + simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), + serpent_simd_algs); } module_init(serpent_init); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index ac0e831943f5..3dafe137596a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -30,21 +30,22 @@ */ #include <linux/module.h> -#include <linux/hardirq.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> -#include <crypto/ablk_helper.h> #include <crypto/algapi.h> -#include <crypto/serpent.h> -#include <crypto/cryptd.h> #include <crypto/b128ops.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> +#include <crypto/internal/simd.h> +#include <crypto/serpent.h> #include <asm/crypto/serpent-sse2.h> #include <asm/crypto/glue_helper.h> +static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); +} + static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) { u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; @@ -139,464 +140,79 @@ static const struct common_glue_ctx serpent_dec_cbc = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, 
unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, - dst, src, nbytes); + return glue_ecb_req_128bit(&serpent_enc, req); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, - nbytes); + return glue_ecb_req_128bit(&serpent_dec, req); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), + req); } -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, - NULL, fpu_enabled, nbytes); + return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } -static inline void serpent_fpu_end(bool fpu_enabled) +static int ctr_crypt(struct skcipher_request *req) { - glue_fpu_end(fpu_enabled); + return glue_ctr_req_128bit(&serpent_ctr, req); } -struct crypt_priv { - struct serpent_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = SERPENT_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; 
- - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { - serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __serpent_encrypt(ctx->ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = SERPENT_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { - serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - __serpent_decrypt(ctx->ctx, srcdst, srcdst); -} - -struct serpent_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct serpent_ctx serpent_ctx; -}; - -static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - - SERPENT_BLOCK_SIZE); - if (err) - return err; - - return lrw_init_table(&ctx->lrw_table, key + keylen - - SERPENT_BLOCK_SIZE); -} - -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->serpent_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int 
nbytes) -{ - struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->serpent_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static void lrw_exit_tfm(struct crypto_tfm *tfm) -{ - struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} - -struct serpent_xts_ctx { - struct serpent_ctx tweak_ctx; - struct serpent_ctx crypt_ctx; +static struct skcipher_alg serpent_algs[] = { + { + .base.cra_name = "__ecb(serpent)", + .base.cra_driver_name = "__ecb-serpent-sse2", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(serpent)", + .base.cra_driver_name = "__cbc-serpent-sse2", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = SERPENT_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(serpent)", + .base.cra_driver_name = "__ctr-serpent-sse2", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct 
serpent_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .chunksize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, }; -static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - le128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - le128 buf[SERPENT_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->crypt_ctx, - .fpu_enabled = false, - }; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), - .crypt_ctx = &crypt_ctx, - .crypt_fn = 
decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = xts_crypt(desc, dst, src, nbytes, &req); - serpent_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static struct crypto_alg serpent_algs[10] = { { - .cra_name = "__ecb-serpent-sse2", - .cra_driver_name = "__driver-ecb-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-serpent-sse2", - .cra_driver_name = "__driver-cbc-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-serpent-sse2", - .cra_driver_name = "__driver-ctr-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-serpent-sse2", - .cra_driver_name = "__driver-lrw-serpent-sse2", - 
.cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = lrw_serpent_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-serpent-sse2", - .cra_driver_name = "__driver-xts-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(serpent)", - .cra_driver_name = "ecb-serpent-sse2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(serpent)", - .cra_driver_name = "cbc-serpent-sse2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct 
async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(serpent)", - .cra_driver_name = "ctr-serpent-sse2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(serpent)", - .cra_driver_name = "lrw-serpent-sse2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(serpent)", - .cra_driver_name = "xts-serpent-sse2", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, 
- .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -} }; +static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init serpent_sse2_init(void) { @@ -605,12 +221,15 @@ static int __init serpent_sse2_init(void) return -ENODEV; } - return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); + return simd_register_skciphers_compat(serpent_algs, + ARRAY_SIZE(serpent_algs), + serpent_simd_algs); } static void __exit serpent_sse2_exit(void) { - crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); + simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), + serpent_simd_algs); } module_init(serpent_sse2_init); diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index acf9fdf01671..e17655ffde79 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -106,13 +106,6 @@ static asmlinkage struct job_sha1* (*sha1_job_mgr_flush) static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job) (struct sha1_mb_mgr *state); -static inline void sha1_init_digest(uint32_t *digest) -{ - static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0, - SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }; - memcpy(digest, initial_digest, sizeof(initial_digest)); -} - static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len) { @@ -244,11 +237,8 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, uint32_t len, int flags) { - if (flags & (~HASH_ENTIRE)) { - /* - * User should not pass anything other than FIRST, UPDATE, or - * LAST - */ + if (flags & ~(HASH_UPDATE | HASH_LAST)) { + /* User should not pass anything other than UPDATE or LAST */ ctx->error = 
HASH_CTX_ERROR_INVALID_FLAGS; return ctx; } @@ -259,24 +249,12 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, return ctx; } - if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { /* Cannot update a finished job. */ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; return ctx; } - - if (flags & HASH_FIRST) { - /* Init digest */ - sha1_init_digest(ctx->job.result_digest); - - /* Reset byte counter */ - ctx->total_length = 0; - - /* Clear extra blocks */ - ctx->partial_block_buffer_length = 0; - } - /* * If we made it here, there were no errors during this call to * submit diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h index 13590ccf965c..9454bd16f9f8 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h +++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h @@ -57,11 +57,9 @@ #include "sha1_mb_mgr.h" #define HASH_UPDATE 0x00 -#define HASH_FIRST 0x01 -#define HASH_LAST 0x02 -#define HASH_ENTIRE 0x03 -#define HASH_DONE 0x04 -#define HASH_FINAL 0x08 +#define HASH_LAST 0x01 +#define HASH_DONE 0x02 +#define HASH_FINAL 0x04 #define HASH_CTX_STS_IDLE 0x00 #define HASH_CTX_STS_PROCESSING 0x01 diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 7926a226b120..4c46ac1b6653 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -106,14 +106,6 @@ static asmlinkage struct job_sha256* (*sha256_job_mgr_flush) static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job) (struct sha256_mb_mgr *state); -inline void sha256_init_digest(uint32_t *digest) -{ - static const uint32_t initial_digest[SHA256_DIGEST_LENGTH] = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7}; - memcpy(digest, initial_digest, sizeof(initial_digest)); -} - inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len) { @@ -245,10 
+237,8 @@ static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr, uint32_t len, int flags) { - if (flags & (~HASH_ENTIRE)) { - /* User should not pass anything other than FIRST, UPDATE - * or LAST - */ + if (flags & ~(HASH_UPDATE | HASH_LAST)) { + /* User should not pass anything other than UPDATE or LAST */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; return ctx; } @@ -259,23 +249,12 @@ static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr, return ctx; } - if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { /* Cannot update a finished job. */ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; return ctx; } - if (flags & HASH_FIRST) { - /* Init digest */ - sha256_init_digest(ctx->job.result_digest); - - /* Reset byte counter */ - ctx->total_length = 0; - - /* Clear extra blocks */ - ctx->partial_block_buffer_length = 0; - } - /* If we made it here, there was no error during this call to submit */ ctx->error = HASH_CTX_ERROR_NONE; diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h index aabb30320af0..7c432543dc7f 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h +++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h @@ -57,11 +57,9 @@ #include "sha256_mb_mgr.h" #define HASH_UPDATE 0x00 -#define HASH_FIRST 0x01 -#define HASH_LAST 0x02 -#define HASH_ENTIRE 0x03 -#define HASH_DONE 0x04 -#define HASH_FINAL 0x08 +#define HASH_LAST 0x01 +#define HASH_DONE 0x02 +#define HASH_FINAL 0x04 #define HASH_CTX_STS_IDLE 0x00 #define HASH_CTX_STS_PROCESSING 0x01 diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 458409b7568d..39e2bbdc1836 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -107,15 +107,6 @@ static asmlinkage struct job_sha512* (*sha512_job_mgr_flush) static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job) (struct 
sha512_mb_mgr *state); -inline void sha512_init_digest(uint64_t *digest) -{ - static const uint64_t initial_digest[SHA512_DIGEST_LENGTH] = { - SHA512_H0, SHA512_H1, SHA512_H2, - SHA512_H3, SHA512_H4, SHA512_H5, - SHA512_H6, SHA512_H7 }; - memcpy(digest, initial_digest, sizeof(initial_digest)); -} - inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len) { @@ -263,11 +254,8 @@ static struct sha512_hash_ctx mgr = cstate->mgr; spin_lock_irqsave(&cstate->work_lock, irqflags); - if (flags & (~HASH_ENTIRE)) { - /* - * User should not pass anything other than FIRST, UPDATE, or - * LAST - */ + if (flags & ~(HASH_UPDATE | HASH_LAST)) { + /* User should not pass anything other than UPDATE or LAST */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; goto unlock; } @@ -278,24 +266,12 @@ static struct sha512_hash_ctx goto unlock; } - if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { /* Cannot update a finished job. 
*/ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; goto unlock; } - - if (flags & HASH_FIRST) { - /* Init digest */ - sha512_init_digest(ctx->job.result_digest); - - /* Reset byte counter */ - ctx->total_length = 0; - - /* Clear extra blocks */ - ctx->partial_block_buffer_length = 0; - } - /* * If we made it here, there were no errors during this call to * submit diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h index e4653f5eec3f..e5c465bd821e 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h +++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h @@ -57,11 +57,9 @@ #include "sha512_mb_mgr.h" #define HASH_UPDATE 0x00 -#define HASH_FIRST 0x01 -#define HASH_LAST 0x02 -#define HASH_ENTIRE 0x03 -#define HASH_DONE 0x04 -#define HASH_FINAL 0x08 +#define HASH_LAST 0x01 +#define HASH_DONE 0x02 +#define HASH_FINAL 0x04 #define HASH_CTX_STS_IDLE 0x00 #define HASH_CTX_STS_PROCESSING 0x01 diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c index 36870b26067a..d08805032f01 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c @@ -57,10 +57,12 @@ void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state) { unsigned int j; - state->lens[0] = 0; - state->lens[1] = 1; - state->lens[2] = 2; - state->lens[3] = 3; + /* initially all lanes are unused */ + state->lens[0] = 0xFFFFFFFF00000000; + state->lens[1] = 0xFFFFFFFF00000001; + state->lens[2] = 0xFFFFFFFF00000002; + state->lens[3] = 0xFFFFFFFF00000003; + state->unused_lanes = 0xFF03020100; for (j = 0; j < 4; j++) state->ldata[j].job_in_lane = NULL; diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index b7a3904b953c..66d989230d10 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -24,24 +24,15 @@ */ #include <linux/module.h> -#include <linux/hardirq.h> #include <linux/types.h> 
#include <linux/crypto.h> #include <linux/err.h> -#include <crypto/ablk_helper.h> #include <crypto/algapi.h> +#include <crypto/internal/simd.h> #include <crypto/twofish.h> -#include <crypto/cryptd.h> -#include <crypto/b128ops.h> -#include <crypto/ctr.h> -#include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/fpu/api.h> -#include <asm/crypto/twofish.h> #include <asm/crypto/glue_helper.h> -#include <crypto/scatterwalk.h> -#include <linux/workqueue.h> -#include <linux/spinlock.h> +#include <asm/crypto/twofish.h> #define TWOFISH_PARALLEL_BLOCKS 8 @@ -61,6 +52,12 @@ asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); +static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return twofish_setkey(&tfm->base, key, keylen); +} + static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) { @@ -79,6 +76,31 @@ static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) GLUE_FUNC_CAST(twofish_dec_blk)); } +struct twofish_xts_ctx { + struct twofish_ctx tweak_ctx; + struct twofish_ctx crypt_ctx; +}; + +static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + u32 *flags = &tfm->base.crt_flags; + int err; + + err = xts_verify_key(tfm, key, keylen); + if (err) + return err; + + /* first half of xts-key is for crypt */ + err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); +} static const struct common_glue_ctx twofish_enc = { .num_funcs = 3, @@ -170,389 +192,113 @@ static const struct common_glue_ctx twofish_dec_xts = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist 
*dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, - dst, src, nbytes); + return glue_ecb_req_128bit(&twofish_enc, req); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, - nbytes); + return glue_ecb_req_128bit(&twofish_dec, req); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk), + req); } -static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) +static int cbc_decrypt(struct skcipher_request *req) { - return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL, - fpu_enabled, nbytes); + return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); } -static inline void twofish_fpu_end(bool fpu_enabled) +static int ctr_crypt(struct skcipher_request *req) { - glue_fpu_end(fpu_enabled); + return glue_ctr_req_128bit(&twofish_ctr, req); } -struct crypt_priv { - struct twofish_ctx *ctx; - bool fpu_enabled; -}; - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int xts_encrypt(struct skcipher_request *req) { - const 
unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { - twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) - twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - nbytes %= bsize * 3; - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_enc_blk(ctx->ctx, srcdst, srcdst); + return glue_xts_req_128bit(&twofish_enc_xts, req, + XTS_TWEAK_CAST(twofish_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int xts_decrypt(struct skcipher_request *req) { - const unsigned int bsize = TF_BLOCK_SIZE; - struct crypt_priv *ctx = priv; - int i; - - ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); - - if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { - twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); - return; - } + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm); - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) - twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); - - nbytes %= bsize * 3; - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_dec_blk(ctx->ctx, srcdst, srcdst); + return glue_xts_req_128bit(&twofish_dec_xts, req, + XTS_TWEAK_CAST(twofish_enc_blk), + &ctx->tweak_ctx, &ctx->crypt_ctx); } -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TWOFISH_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - 
.tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = encrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[TWOFISH_PARALLEL_BLOCKS]; - struct crypt_priv crypt_ctx = { - .ctx = &ctx->twofish_ctx, - .fpu_enabled = false, - }; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &crypt_ctx, - .crypt_fn = decrypt_callback, - }; - int ret; - - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - ret = lrw_crypt(desc, dst, src, nbytes, &req); - twofish_fpu_end(crypt_ctx.fpu_enabled); - - return ret; -} - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - - return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, - XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); -} - -static struct crypto_alg twofish_algs[10] = { { - .cra_name = "__ecb-twofish-avx", - .cra_driver_name = "__driver-ecb-twofish-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = 
&crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "__cbc-twofish-avx", - .cra_driver_name = "__driver-cbc-twofish-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "__ctr-twofish-avx", - .cra_driver_name = "__driver-ctr-twofish-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "__lrw-twofish-avx", - .cra_driver_name = "__driver-lrw-twofish-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_twofish_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = lrw_twofish_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "__xts-twofish-avx", 
- .cra_driver_name = "__driver-xts-twofish-avx", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | - CRYPTO_ALG_INTERNAL, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}, { - .cra_name = "ecb(twofish)", - .cra_driver_name = "ecb-twofish-avx", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "cbc(twofish)", - .cra_driver_name = "cbc-twofish-avx", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = __ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "ctr(twofish)", - .cra_driver_name = "ctr-twofish-avx", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = 
&crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}, { - .cra_name = "lrw(twofish)", - .cra_driver_name = "lrw-twofish-avx", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + - TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + - TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}, { - .cra_name = "xts(twofish)", - .cra_driver_name = "xts-twofish-avx", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, +static struct skcipher_alg twofish_algs[] = { + { + .base.cra_name = "__ecb(twofish)", + .base.cra_driver_name = "__ecb-twofish-avx", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = TF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct twofish_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = 
twofish_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "__cbc(twofish)", + .base.cra_driver_name = "__cbc-twofish-avx", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = TF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct twofish_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "__ctr(twofish)", + .base.cra_driver_name = "__ctr-twofish-avx", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct twofish_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .chunksize = TF_BLOCK_SIZE, + .setkey = twofish_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, { + .base.cra_name = "__xts(twofish)", + .base.cra_driver_name = "__xts-twofish-avx", + .base.cra_priority = 400, + .base.cra_flags = CRYPTO_ALG_INTERNAL, + .base.cra_blocksize = TF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct twofish_xts_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = 2 * TF_MIN_KEY_SIZE, + .max_keysize = 2 * TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = xts_twofish_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, }, -} }; +}; + +static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)]; static int __init twofish_init(void) { @@ -563,12 +309,15 @@ static int __init twofish_init(void) return -ENODEV; } - return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); + return simd_register_skciphers_compat(twofish_algs, + ARRAY_SIZE(twofish_algs), + twofish_simd_algs); } static void __exit twofish_exit(void) { - crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); + 
simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs), + twofish_simd_algs); } module_init(twofish_init); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 243e90a4b5d9..571485502ec8 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -20,22 +20,26 @@ * */ -#include <asm/processor.h> +#include <asm/crypto/glue_helper.h> +#include <asm/crypto/twofish.h> +#include <crypto/algapi.h> +#include <crypto/b128ops.h> +#include <crypto/internal/skcipher.h> +#include <crypto/twofish.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <crypto/algapi.h> -#include <crypto/twofish.h> -#include <crypto/b128ops.h> -#include <asm/crypto/twofish.h> -#include <asm/crypto/glue_helper.h> -#include <crypto/lrw.h> -#include <crypto/xts.h> EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); +static int twofish_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + return twofish_setkey(&tfm->base, key, keylen); +} + static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) { @@ -151,284 +155,74 @@ static const struct common_glue_ctx twofish_dec_cbc = { } } }; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); -} - -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, - dst, src, nbytes); -} - -static int cbc_decrypt(struct blkcipher_desc *desc, struct 
scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, - nbytes); -} - -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); -} - -static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +static int ecb_encrypt(struct skcipher_request *req) { - const unsigned int bsize = TF_BLOCK_SIZE; - struct twofish_ctx *ctx = priv; - int i; - - if (nbytes == 3 * bsize) { - twofish_enc_blk_3way(ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_enc_blk(ctx, srcdst, srcdst); -} - -static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) -{ - const unsigned int bsize = TF_BLOCK_SIZE; - struct twofish_ctx *ctx = priv; - int i; - - if (nbytes == 3 * bsize) { - twofish_dec_blk_3way(ctx, srcdst, srcdst); - return; - } - - for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) - twofish_dec_blk(ctx, srcdst, srcdst); -} - -int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - int err; - - err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE, - &tfm->crt_flags); - if (err) - return err; - - return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); + return glue_ecb_req_128bit(&twofish_enc, req); } -EXPORT_SYMBOL_GPL(lrw_twofish_setkey); -static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_decrypt(struct skcipher_request *req) { - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[3]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &ctx->twofish_ctx, - .crypt_fn = encrypt_callback, - 
}; - - return lrw_crypt(desc, dst, src, nbytes, &req); + return glue_ecb_req_128bit(&twofish_dec, req); } -static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_encrypt(struct skcipher_request *req) { - struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - be128 buf[3]; - struct lrw_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .table_ctx = &ctx->lrw_table, - .crypt_ctx = &ctx->twofish_ctx, - .crypt_fn = decrypt_callback, - }; - - return lrw_crypt(desc, dst, src, nbytes, &req); + return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk), + req); } -void lrw_twofish_exit_tfm(struct crypto_tfm *tfm) +static int cbc_decrypt(struct skcipher_request *req) { - struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - - lrw_free_table(&ctx->lrw_table); -} -EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm); - -int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - int err; - - err = xts_check_key(tfm, key, keylen); - if (err) - return err; - - /* first half of xts-key is for crypt */ - err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); - if (err) - return err; - - /* second half of xts-key is for tweak */ - return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, - flags); -} -EXPORT_SYMBOL_GPL(xts_twofish_setkey); - -static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - le128 buf[3]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), - .crypt_ctx = &ctx->crypt_ctx, - .crypt_fn = encrypt_callback, - }; - - return xts_crypt(desc, dst, src, nbytes, &req); + return 
glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req); } -static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_crypt(struct skcipher_request *req) { - struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - le128 buf[3]; - struct xts_crypt_req req = { - .tbuf = buf, - .tbuflen = sizeof(buf), - - .tweak_ctx = &ctx->tweak_ctx, - .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), - .crypt_ctx = &ctx->crypt_ctx, - .crypt_fn = decrypt_callback, - }; - - return xts_crypt(desc, dst, src, nbytes, &req); + return glue_ctr_req_128bit(&twofish_ctr, req); } -static struct crypto_alg tf_algs[5] = { { - .cra_name = "ecb(twofish)", - .cra_driver_name = "ecb-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}, { - .cra_name = "cbc(twofish)", - .cra_driver_name = "cbc-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}, { - .cra_name = "ctr(twofish)", - .cra_driver_name = "ctr-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - 
.cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}, { - .cra_name = "lrw(twofish)", - .cra_driver_name = "lrw-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_exit = lrw_twofish_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = lrw_twofish_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}, { - .cra_name = "xts(twofish)", - .cra_driver_name = "xts-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE * 2, - .max_keysize = TF_MAX_KEY_SIZE * 2, - .ivsize = TF_BLOCK_SIZE, - .setkey = xts_twofish_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, +static struct skcipher_alg tf_skciphers[] = { + { + .base.cra_name = "ecb(twofish)", + .base.cra_driver_name = "ecb-twofish-3way", + .base.cra_priority = 300, + .base.cra_blocksize = TF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct twofish_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey_skcipher, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, { + .base.cra_name = "cbc(twofish)", + .base.cra_driver_name = "cbc-twofish-3way", + .base.cra_priority = 300, + .base.cra_blocksize = TF_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct twofish_ctx), + 
.base.cra_module = THIS_MODULE, + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey_skcipher, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, { + .base.cra_name = "ctr(twofish)", + .base.cra_driver_name = "ctr-twofish-3way", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct twofish_ctx), + .base.cra_module = THIS_MODULE, + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .chunksize = TF_BLOCK_SIZE, + .setkey = twofish_setkey_skcipher, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, }, -} }; +}; static bool is_blacklisted_cpu(void) { @@ -478,12 +272,13 @@ static int __init init(void) return -ENODEV; } - return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); + return crypto_register_skciphers(tf_skciphers, + ARRAY_SIZE(tf_skciphers)); } static void __exit fini(void) { - crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); + crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers)); } module_init(init); diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3f48f695d5e6..352e70cd33e8 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -97,80 +97,80 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 - .macro ALLOC_PT_GPREGS_ON_STACK - addq $-(15*8), %rsp - .endm - - .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 - .if \r11 - movq %r11, 6*8+\offset(%rsp) - .endif - .if \r8910 - movq %r10, 7*8+\offset(%rsp) - movq %r9, 8*8+\offset(%rsp) - movq %r8, 9*8+\offset(%rsp) - .endif - .if \rax - movq %rax, 10*8+\offset(%rsp) +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 + /* + * Push registers and sanitize registers of values that a + * speculation attack might otherwise want to exploit. 
The + * lower registers are likely clobbered well before they + * could be put to use in a speculative execution gadget. + * Interleave XOR with PUSH for better uop scheduling: + */ + .if \save_ret + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ + movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */ + .else + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ .endif - .if \rcx - movq %rcx, 11*8+\offset(%rsp) + pushq \rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ + pushq %rcx /* pt_regs->cx */ + xorl %ecx, %ecx /* nospec cx */ + pushq \rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + xorl %r8d, %r8d /* nospec r8 */ + pushq %r9 /* pt_regs->r9 */ + xorl %r9d, %r9d /* nospec r9 */ + pushq %r10 /* pt_regs->r10 */ + xorl %r10d, %r10d /* nospec r10 */ + pushq %r11 /* pt_regs->r11 */ + xorl %r11d, %r11d /* nospec r11*/ + pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx*/ + pushq %rbp /* pt_regs->rbp */ + xorl %ebp, %ebp /* nospec rbp*/ + pushq %r12 /* pt_regs->r12 */ + xorl %r12d, %r12d /* nospec r12*/ + pushq %r13 /* pt_regs->r13 */ + xorl %r13d, %r13d /* nospec r13*/ + pushq %r14 /* pt_regs->r14 */ + xorl %r14d, %r14d /* nospec r14*/ + pushq %r15 /* pt_regs->r15 */ + xorl %r15d, %r15d /* nospec r15*/ + UNWIND_HINT_REGS + .if \save_ret + pushq %rsi /* return address on top of stack */ .endif - movq %rdx, 12*8+\offset(%rsp) - movq %rsi, 13*8+\offset(%rsp) - movq %rdi, 14*8+\offset(%rsp) - UNWIND_HINT_REGS offset=\offset extra=0 - .endm - .macro SAVE_C_REGS offset=0 - SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 - .endm - .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 - SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 - .endm - .macro SAVE_C_REGS_EXCEPT_R891011 - SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 - .endm - .macro SAVE_C_REGS_EXCEPT_RCX_R891011 - SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 - .endm - .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 - SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 - .endm - - .macro 
SAVE_EXTRA_REGS offset=0 - movq %r15, 0*8+\offset(%rsp) - movq %r14, 1*8+\offset(%rsp) - movq %r13, 2*8+\offset(%rsp) - movq %r12, 3*8+\offset(%rsp) - movq %rbp, 4*8+\offset(%rsp) - movq %rbx, 5*8+\offset(%rsp) - UNWIND_HINT_REGS offset=\offset - .endm - - .macro POP_EXTRA_REGS +.endm + +.macro POP_REGS pop_rdi=1 skip_r11rcx=0 popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx - .endm - - .macro POP_C_REGS + .if \skip_r11rcx + popq %rsi + .else popq %r11 + .endif popq %r10 popq %r9 popq %r8 popq %rax + .if \skip_r11rcx + popq %rsi + .else popq %rcx + .endif popq %rdx popq %rsi + .if \pop_rdi popq %rdi - .endm - - .macro icebp - .byte 0xf1 - .endm + .endif +.endm /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The @@ -178,17 +178,12 @@ For 32-bit we have the following conventions - kernel is built with * is just setting the LSB, which makes it an invalid stack address and is also * a signal to the unwinder that it's a pt_regs pointer in disguise. * - * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts + * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts * the original rbp. 
*/ .macro ENCODE_FRAME_POINTER ptregs_offset=0 #ifdef CONFIG_FRAME_POINTER - .if \ptregs_offset - leaq \ptregs_offset(%rsp), %rbp - .else - mov %rsp, %rbp - .endif - orq $0x1, %rbp + leaq 1+\ptregs_offset(%rsp), %rbp #endif .endm diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 74f6eee15179..fbf6a6c3fd2d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -266,14 +266,13 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) } #ifdef CONFIG_X86_64 -__visible void do_syscall_64(struct pt_regs *regs) +__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) { - struct thread_info *ti = current_thread_info(); - unsigned long nr = regs->orig_ax; + struct thread_info *ti; enter_from_user_mode(); local_irq_enable(); - + ti = current_thread_info(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); @@ -282,11 +281,10 @@ __visible void do_syscall_64(struct pt_regs *regs) * table. The only functional difference is the x32 bit in * regs->orig_ax, which changes the behavior of some syscalls. 
*/ - if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { - nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls); - regs->ax = sys_call_table[nr]( - regs->di, regs->si, regs->dx, - regs->r10, regs->r8, regs->r9); + nr &= __SYSCALL_MASK; + if (likely(nr < NR_syscalls)) { + nr = array_index_nospec(nr, NR_syscalls); + regs->ax = sys_call_table[nr](regs); } syscall_return_slowpath(regs); @@ -321,6 +319,9 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) if (likely(nr < IA32_NR_syscalls)) { nr = array_index_nospec(nr, IA32_NR_syscalls); +#ifdef CONFIG_IA32_EMULATION + regs->ax = ia32_sys_call_table[nr](regs); +#else /* * It's possible that a 32-bit syscall implementation * takes a 64-bit parameter but nonetheless assumes that @@ -331,6 +332,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) (unsigned int)regs->bx, (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->si, (unsigned int)regs->di, (unsigned int)regs->bp); +#endif /* CONFIG_IA32_EMULATION */ } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 16c2c022540d..bef8e2b202a8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -252,8 +252,7 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. 
*/ - /* Clobbers %ebx */ - FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ @@ -903,6 +902,9 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr) +BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, + hv_stimer0_vector_handler) + #endif /* CONFIG_HYPERV */ ENTRY(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 30c8c5344c4a..3166b9674429 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -55,7 +55,7 @@ END(native_usergs_sysret64) .macro TRACE_IRQS_FLAGS flags:req #ifdef CONFIG_TRACE_IRQFLAGS - bt $9, \flags /* interrupts off? */ + btl $9, \flags /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: @@ -213,7 +213,7 @@ ENTRY(entry_SYSCALL_64) swapgs /* - * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it + * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it * is not required to switch CR3. */ movq %rsp, PER_CPU_VAR(rsp_scratch) @@ -227,27 +227,14 @@ ENTRY(entry_SYSCALL_64) pushq %rcx /* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp */ - pushq %r12 /* pt_regs->r12 */ - pushq %r13 /* pt_regs->r13 */ - pushq %r14 /* pt_regs->r14 */ - pushq %r15 /* pt_regs->r15 */ - UNWIND_HINT_REGS + + PUSH_AND_CLEAR_REGS rax=$-ENOSYS TRACE_IRQS_OFF /* IRQs are off. 
*/ - movq %rsp, %rdi + movq %rax, %rdi + movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -274,8 +261,13 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) * Change top bits to match most significant bit (47th or 56th bit * depending on paging mode) in the address. */ +#ifdef CONFIG_X86_5LEVEL + ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ + "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 +#else shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx +#endif /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 @@ -321,15 +313,7 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ UNWIND_HINT_EMPTY - POP_EXTRA_REGS - popq %rsi /* skip r11 */ - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rsi /* skip rcx */ - popq %rdx - popq %rsi + POP_REGS pop_rdi=0 skip_r11rcx=1 /* * Now all regs are restored except RSP and RDI. @@ -386,8 +370,7 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - /* Clobbers %rbx */ - FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ @@ -471,9 +454,19 @@ END(irq_entries_start) * * The invariant is that, if irq_count != -1, then the IRQ stack is in use. */ -.macro ENTER_IRQ_STACK regs=1 old_rsp +.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0 DEBUG_ENTRY_ASSERT_IRQS_OFF + + .if \save_ret + /* + * If save_ret is set, the original stack contains one additional + * entry -- the return address. Therefore, move the address one + * entry below %rsp to \old_rsp. 
+ */ + leaq 8(%rsp), \old_rsp + .else movq %rsp, \old_rsp + .endif .if \regs UNWIND_HINT_REGS base=\old_rsp @@ -519,6 +512,15 @@ END(irq_entries_start) .if \regs UNWIND_HINT_REGS indirect=1 .endif + + .if \save_ret + /* + * Push the return address to the stack. This return address can + * be found at the "real" original RSP, which was offset by 8 at + * the beginning of this macro. + */ + pushq -8(\old_rsp) + .endif .endm /* @@ -542,29 +544,65 @@ END(irq_entries_start) .endm /* - * Interrupt entry/exit. + * Interrupt entry helper function. * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. + * Entry runs with interrupts off. Stack layout at entry: + * +----------------------------------------------------+ + * | regs->ss | + * | regs->rsp | + * | regs->eflags | + * | regs->cs | + * | regs->ip | + * +----------------------------------------------------+ + * | regs->orig_ax = ~(interrupt number) | + * +----------------------------------------------------+ + * | return address | + * +----------------------------------------------------+ */ - -/* 0(%rsp): ~(interrupt number) */ - .macro interrupt func +ENTRY(interrupt_entry) + UNWIND_HINT_FUNC + ASM_CLAC cld - testb $3, CS-ORIG_RAX(%rsp) + testb $3, CS-ORIG_RAX+8(%rsp) jz 1f SWAPGS - call switch_to_thread_stack + + /* + * Switch to the thread stack. The IRET frame and orig_ax are + * on the stack, as well as the return address. RDI..R12 are + * not (yet) on the stack and space has not (yet) been + * allocated for them. + */ + pushq %rdi + + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* + * We have RDI, return address, and orig_ax on the stack on + * top of the IRET frame. 
That means offset=24 + */ + UNWIND_HINT_IRET_REGS base=%rdi offset=24 + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + UNWIND_HINT_FUNC + + movq (%rdi), %rdi 1: - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS - SAVE_EXTRA_REGS - ENCODE_FRAME_POINTER + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 - testb $3, CS(%rsp) + testb $3, CS+8(%rsp) jz 1f /* @@ -572,7 +610,7 @@ END(irq_entries_start) * * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). Since TRACE_IRQS_OFF idempotent, + * (which can take locks). Since TRACE_IRQS_OFF is idempotent, * the simplest way to handle it is to just call it twice if * we enter from user mode. There's no reason to optimize this since * TRACE_IRQS_OFF is a no-op if lockdep is off. @@ -582,12 +620,15 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - ENTER_IRQ_STACK old_rsp=%rdi + ENTER_IRQ_STACK old_rsp=%rdi save_ret=1 /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func /* rdi points to pt_regs */ - .endm + ret +END(interrupt_entry) + + +/* Interrupt entry/exit. 
*/ /* * The interrupt stubs push (~vector+0x80) onto the stack and @@ -595,9 +636,10 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - ASM_CLAC addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - interrupt do_IRQ + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call do_IRQ /* rdi points to pt_regs */ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) @@ -622,15 +664,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) ud2 1: #endif - POP_EXTRA_REGS - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi + POP_REGS pop_rdi=0 /* * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. @@ -688,8 +722,7 @@ GLOBAL(restore_regs_and_return_to_kernel) ud2 1: #endif - POP_EXTRA_REGS - POP_C_REGS + POP_REGS addq $8, %rsp /* skip regs->orig_ax */ /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -799,10 +832,11 @@ END(common_interrupt) .macro apicinterrupt3 num sym do_sym ENTRY(\sym) UNWIND_HINT_IRET_REGS - ASM_CLAC pushq $~(\num) .Lcommon_\sym: - interrupt \do_sym + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call \do_sym /* rdi points to pt_regs */ jmp ret_from_intr END(\sym) .endm @@ -865,34 +899,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) -/* - * Switch to the thread stack. This is called with the IRET frame and - * orig_ax on the stack. (That is, RDI..R12 are not on the stack and - * space has not been allocated for them.) - */ -ENTRY(switch_to_thread_stack) - UNWIND_HINT_FUNC - - pushq %rdi - /* Need to switch before accessing the thread stack. 
*/ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - movq %rsp, %rdi - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI - - pushq 7*8(%rdi) /* regs->ss */ - pushq 6*8(%rdi) /* regs->rsp */ - pushq 5*8(%rdi) /* regs->eflags */ - pushq 4*8(%rdi) /* regs->cs */ - pushq 3*8(%rdi) /* regs->ip */ - pushq 2*8(%rdi) /* regs->orig_ax */ - pushq 8(%rdi) /* return address */ - UNWIND_HINT_FUNC - - movq (%rdi), %rdi - ret -END(switch_to_thread_stack) - .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -908,10 +914,8 @@ ENTRY(\sym) pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif - ALLOC_PT_GPREGS_ON_STACK - - .if \paranoid < 2 - testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ + .if \paranoid == 1 + testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ jnz .Lfrom_usermode_switch_stack_\@ .endif @@ -957,7 +961,7 @@ ENTRY(\sym) jmp error_exit .endif - .if \paranoid < 2 + .if \paranoid == 1 /* * Entry from userspace. Switch stacks and treat it * as a normal entry. 
This means that paranoid handlers @@ -1121,9 +1125,7 @@ ENTRY(xen_failsafe_callback) addq $0x30, %rsp UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS - SAVE_EXTRA_REGS + PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) @@ -1139,10 +1141,13 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ hyperv_reenlightenment_vector hyperv_reenlightenment_intr + +apicinterrupt3 HYPERV_STIMER0_VECTOR \ + hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN @@ -1170,8 +1175,7 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 ENTRY(paranoid_entry) UNWIND_HINT_FUNC cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 + PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 movl $1, %ebx movl $MSR_GS_BASE, %ecx @@ -1211,21 +1215,20 @@ ENTRY(paranoid_exit) jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* - * Save all registers in pt_regs, and switch gs if needed. + * Save all registers in pt_regs, and switch GS if needed. 
* Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) UNWIND_HINT_FUNC cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 + PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1406,22 +1409,7 @@ ENTRY(nmi) pushq 1*8(%rdx) /* pt_regs->rip */ UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq (%rdx) /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq %rax /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp */ - pushq %r12 /* pt_regs->r12 */ - pushq %r13 /* pt_regs->r13 */ - pushq %r14 /* pt_regs->r14 */ - pushq %r15 /* pt_regs->r15 */ - UNWIND_HINT_REGS + PUSH_AND_CLEAR_REGS rdx=(%rdx) ENCODE_FRAME_POINTER /* @@ -1631,7 +1619,6 @@ end_repeat_nmi: * frame to point back to repeat_nmi. */ pushq $-1 /* ORIG_RAX: no syscall to restart */ - ALLOC_PT_GPREGS_ON_STACK /* * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit @@ -1655,8 +1642,7 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - POP_EXTRA_REGS - POP_C_REGS + POP_REGS /* * Skip orig_ax and the "outermost" frame to point RSP at the "iret" diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 98d5358e4041..9de7f1e1dede 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -84,16 +84,26 @@ ENTRY(entry_SYSENTER_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ - pushq $0 /* pt_regs->r9 = 0 */ - pushq $0 /* pt_regs->r10 = 0 */ - pushq $0 /* pt_regs->r11 = 0 */ + pushq %r8 /* pt_regs->r8 */ + xorl %r8d, %r8d /* nospec r8 */ + pushq %r9 /* pt_regs->r9 */ + xorl %r9d, %r9d /* nospec r9 */ + pushq %r10 /* pt_regs->r10 */ + xorl %r10d, 
%r10d /* nospec r10 */ + pushq %r11 /* pt_regs->r11 */ + xorl %r11d, %r11d /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ + xorl %ebp, %ebp /* nospec rbp */ pushq $0 /* pt_regs->r12 = 0 */ + xorl %r12d, %r12d /* nospec r12 */ pushq $0 /* pt_regs->r13 = 0 */ + xorl %r13d, %r13d /* nospec r13 */ pushq $0 /* pt_regs->r14 = 0 */ + xorl %r14d, %r14d /* nospec r14 */ pushq $0 /* pt_regs->r15 = 0 */ + xorl %r15d, %r15d /* nospec r15 */ cld /* @@ -210,19 +220,32 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ + xorl %esi, %esi /* nospec si */ pushq %rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ pushq %rbp /* pt_regs->cx (stashed in bp) */ + xorl %ecx, %ecx /* nospec cx */ pushq $-ENOSYS /* pt_regs->ax */ pushq $0 /* pt_regs->r8 = 0 */ + xorl %r8d, %r8d /* nospec r8 */ pushq $0 /* pt_regs->r9 = 0 */ + xorl %r9d, %r9d /* nospec r9 */ pushq $0 /* pt_regs->r10 = 0 */ + xorl %r10d, %r10d /* nospec r10 */ pushq $0 /* pt_regs->r11 = 0 */ + xorl %r11d, %r11d /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ + xorl %ebp, %ebp /* nospec rbp */ pushq $0 /* pt_regs->r12 = 0 */ + xorl %r12d, %r12d /* nospec r12 */ pushq $0 /* pt_regs->r13 = 0 */ + xorl %r13d, %r13d /* nospec r13 */ pushq $0 /* pt_regs->r14 = 0 */ + xorl %r14d, %r14d /* nospec r14 */ pushq $0 /* pt_regs->r15 = 0 */ + xorl %r15d, %r15d /* nospec r15 */ /* * User mode is traced as though IRQs are on, and SYSENTER @@ -278,9 +301,9 @@ sysret32_from_system_call: */ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d swapgs sysretl END(entry_SYSCALL_compat) @@ -327,26 +350,50 @@ ENTRY(entry_INT80_compat) */ movl %eax, %eax + /* switch to thread 
stack expects orig_ax and rdi to be pushed */ pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ - /* switch to thread stack expects orig_ax to be pushed */ - call switch_to_thread_stack + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - pushq %rdi /* pt_regs->di */ + pushq 6*8(%rdi) /* regs->ss */ + pushq 5*8(%rdi) /* regs->rsp */ + pushq 4*8(%rdi) /* regs->eflags */ + pushq 3*8(%rdi) /* regs->cs */ + pushq 2*8(%rdi) /* regs->ip */ + pushq 1*8(%rdi) /* regs->orig_ax */ + + pushq (%rdi) /* pt_regs->di */ pushq %rsi /* pt_regs->si */ + xorl %esi, %esi /* nospec si */ pushq %rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ pushq %rcx /* pt_regs->cx */ + xorl %ecx, %ecx /* nospec cx */ pushq $-ENOSYS /* pt_regs->ax */ pushq $0 /* pt_regs->r8 = 0 */ + xorl %r8d, %r8d /* nospec r8 */ pushq $0 /* pt_regs->r9 = 0 */ + xorl %r9d, %r9d /* nospec r9 */ pushq $0 /* pt_regs->r10 = 0 */ + xorl %r10d, %r10d /* nospec r10 */ pushq $0 /* pt_regs->r11 = 0 */ + xorl %r11d, %r11d /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ pushq %rbp /* pt_regs->rbp */ + xorl %ebp, %ebp /* nospec rbp */ pushq %r12 /* pt_regs->r12 */ + xorl %r12d, %r12d /* nospec r12 */ pushq %r13 /* pt_regs->r13 */ + xorl %r13d, %r13d /* nospec r13 */ pushq %r14 /* pt_regs->r14 */ + xorl %r14d, %r14d /* nospec r14 */ pushq %r15 /* pt_regs->r15 */ + xorl %r15d, %r15d /* nospec r15 */ cld /* @@ -363,15 +410,3 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_ON jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) - -ENTRY(stub32_clone) - /* - * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). - * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). 
- * - * The native 64-bit kernel's sys_clone() implements the latter, - * so we need to swap arguments here before calling it: - */ - xchg %r8, %rcx - jmp sys_clone -ENDPROC(stub32_clone) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 95c294963612..aa3336a7cb15 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -7,14 +7,23 @@ #include <asm/asm-offsets.h> #include <asm/syscall.h> -#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#ifdef CONFIG_IA32_EMULATION +/* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); + +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); + +#else /* CONFIG_IA32_EMULATION */ +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_IA32_EMULATION */ + #include <asm/syscalls_32.h> #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, -extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); - __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index c176d2fab1da..d5252bc1e380 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -7,14 +7,14 @@ #include <asm/asm-offsets.h> #include <asm/syscall.h> -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, 
unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); #include <asm/syscalls_64.h> #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, -extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); - asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac2161112..d6b27dab1b30 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -4,390 +4,395 @@ # The format is: # <number> <abi> <name> <entry point> <compat entry point> # +# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for +# sys_*() system calls and compat_sys_*() compat system calls if +# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only +# parameter. +# # The abi is always "i386" for this file. 
# -0 i386 restart_syscall sys_restart_syscall -1 i386 exit sys_exit -2 i386 fork sys_fork sys_fork -3 i386 read sys_read -4 i386 write sys_write -5 i386 open sys_open compat_sys_open -6 i386 close sys_close -7 i386 waitpid sys_waitpid sys32_waitpid -8 i386 creat sys_creat -9 i386 link sys_link -10 i386 unlink sys_unlink -11 i386 execve sys_execve compat_sys_execve -12 i386 chdir sys_chdir -13 i386 time sys_time compat_sys_time -14 i386 mknod sys_mknod -15 i386 chmod sys_chmod -16 i386 lchown sys_lchown16 +0 i386 restart_syscall sys_restart_syscall __ia32_sys_restart_syscall +1 i386 exit sys_exit __ia32_sys_exit +2 i386 fork sys_fork __ia32_sys_fork +3 i386 read sys_read __ia32_sys_read +4 i386 write sys_write __ia32_sys_write +5 i386 open sys_open __ia32_compat_sys_open +6 i386 close sys_close __ia32_sys_close +7 i386 waitpid sys_waitpid __ia32_sys_waitpid +8 i386 creat sys_creat __ia32_sys_creat +9 i386 link sys_link __ia32_sys_link +10 i386 unlink sys_unlink __ia32_sys_unlink +11 i386 execve sys_execve __ia32_compat_sys_execve +12 i386 chdir sys_chdir __ia32_sys_chdir +13 i386 time sys_time __ia32_compat_sys_time +14 i386 mknod sys_mknod __ia32_sys_mknod +15 i386 chmod sys_chmod __ia32_sys_chmod +16 i386 lchown sys_lchown16 __ia32_sys_lchown16 17 i386 break -18 i386 oldstat sys_stat -19 i386 lseek sys_lseek compat_sys_lseek -20 i386 getpid sys_getpid -21 i386 mount sys_mount compat_sys_mount -22 i386 umount sys_oldumount -23 i386 setuid sys_setuid16 -24 i386 getuid sys_getuid16 -25 i386 stime sys_stime compat_sys_stime -26 i386 ptrace sys_ptrace compat_sys_ptrace -27 i386 alarm sys_alarm -28 i386 oldfstat sys_fstat -29 i386 pause sys_pause -30 i386 utime sys_utime compat_sys_utime +18 i386 oldstat sys_stat __ia32_sys_stat +19 i386 lseek sys_lseek __ia32_compat_sys_lseek +20 i386 getpid sys_getpid __ia32_sys_getpid +21 i386 mount sys_mount __ia32_compat_sys_mount +22 i386 umount sys_oldumount __ia32_sys_oldumount +23 i386 setuid sys_setuid16 __ia32_sys_setuid16 
+24 i386 getuid sys_getuid16 __ia32_sys_getuid16 +25 i386 stime sys_stime __ia32_compat_sys_stime +26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace +27 i386 alarm sys_alarm __ia32_sys_alarm +28 i386 oldfstat sys_fstat __ia32_sys_fstat +29 i386 pause sys_pause __ia32_sys_pause +30 i386 utime sys_utime __ia32_compat_sys_utime 31 i386 stty 32 i386 gtty -33 i386 access sys_access -34 i386 nice sys_nice +33 i386 access sys_access __ia32_sys_access +34 i386 nice sys_nice __ia32_sys_nice 35 i386 ftime -36 i386 sync sys_sync -37 i386 kill sys_kill -38 i386 rename sys_rename -39 i386 mkdir sys_mkdir -40 i386 rmdir sys_rmdir -41 i386 dup sys_dup -42 i386 pipe sys_pipe -43 i386 times sys_times compat_sys_times +36 i386 sync sys_sync __ia32_sys_sync +37 i386 kill sys_kill __ia32_sys_kill +38 i386 rename sys_rename __ia32_sys_rename +39 i386 mkdir sys_mkdir __ia32_sys_mkdir +40 i386 rmdir sys_rmdir __ia32_sys_rmdir +41 i386 dup sys_dup __ia32_sys_dup +42 i386 pipe sys_pipe __ia32_sys_pipe +43 i386 times sys_times __ia32_compat_sys_times 44 i386 prof -45 i386 brk sys_brk -46 i386 setgid sys_setgid16 -47 i386 getgid sys_getgid16 -48 i386 signal sys_signal -49 i386 geteuid sys_geteuid16 -50 i386 getegid sys_getegid16 -51 i386 acct sys_acct -52 i386 umount2 sys_umount +45 i386 brk sys_brk __ia32_sys_brk +46 i386 setgid sys_setgid16 __ia32_sys_setgid16 +47 i386 getgid sys_getgid16 __ia32_sys_getgid16 +48 i386 signal sys_signal __ia32_sys_signal +49 i386 geteuid sys_geteuid16 __ia32_sys_geteuid16 +50 i386 getegid sys_getegid16 __ia32_sys_getegid16 +51 i386 acct sys_acct __ia32_sys_acct +52 i386 umount2 sys_umount __ia32_sys_umount 53 i386 lock -54 i386 ioctl sys_ioctl compat_sys_ioctl -55 i386 fcntl sys_fcntl compat_sys_fcntl64 +54 i386 ioctl sys_ioctl __ia32_compat_sys_ioctl +55 i386 fcntl sys_fcntl __ia32_compat_sys_fcntl64 56 i386 mpx -57 i386 setpgid sys_setpgid +57 i386 setpgid sys_setpgid __ia32_sys_setpgid 58 i386 ulimit -59 i386 oldolduname sys_olduname -60 i386 umask 
sys_umask -61 i386 chroot sys_chroot -62 i386 ustat sys_ustat compat_sys_ustat -63 i386 dup2 sys_dup2 -64 i386 getppid sys_getppid -65 i386 getpgrp sys_getpgrp -66 i386 setsid sys_setsid -67 i386 sigaction sys_sigaction compat_sys_sigaction -68 i386 sgetmask sys_sgetmask -69 i386 ssetmask sys_ssetmask -70 i386 setreuid sys_setreuid16 -71 i386 setregid sys_setregid16 -72 i386 sigsuspend sys_sigsuspend sys_sigsuspend -73 i386 sigpending sys_sigpending compat_sys_sigpending -74 i386 sethostname sys_sethostname -75 i386 setrlimit sys_setrlimit compat_sys_setrlimit -76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit -77 i386 getrusage sys_getrusage compat_sys_getrusage -78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday -79 i386 settimeofday sys_settimeofday compat_sys_settimeofday -80 i386 getgroups sys_getgroups16 -81 i386 setgroups sys_setgroups16 -82 i386 select sys_old_select compat_sys_old_select -83 i386 symlink sys_symlink -84 i386 oldlstat sys_lstat -85 i386 readlink sys_readlink -86 i386 uselib sys_uselib -87 i386 swapon sys_swapon -88 i386 reboot sys_reboot -89 i386 readdir sys_old_readdir compat_sys_old_readdir -90 i386 mmap sys_old_mmap sys32_mmap -91 i386 munmap sys_munmap -92 i386 truncate sys_truncate compat_sys_truncate -93 i386 ftruncate sys_ftruncate compat_sys_ftruncate -94 i386 fchmod sys_fchmod -95 i386 fchown sys_fchown16 -96 i386 getpriority sys_getpriority -97 i386 setpriority sys_setpriority +59 i386 oldolduname sys_olduname __ia32_sys_olduname +60 i386 umask sys_umask __ia32_sys_umask +61 i386 chroot sys_chroot __ia32_sys_chroot +62 i386 ustat sys_ustat __ia32_compat_sys_ustat +63 i386 dup2 sys_dup2 __ia32_sys_dup2 +64 i386 getppid sys_getppid __ia32_sys_getppid +65 i386 getpgrp sys_getpgrp __ia32_sys_getpgrp +66 i386 setsid sys_setsid __ia32_sys_setsid +67 i386 sigaction sys_sigaction __ia32_compat_sys_sigaction +68 i386 sgetmask sys_sgetmask __ia32_sys_sgetmask +69 i386 ssetmask sys_ssetmask __ia32_sys_ssetmask +70 i386 
setreuid sys_setreuid16 __ia32_sys_setreuid16 +71 i386 setregid sys_setregid16 __ia32_sys_setregid16 +72 i386 sigsuspend sys_sigsuspend __ia32_sys_sigsuspend +73 i386 sigpending sys_sigpending __ia32_compat_sys_sigpending +74 i386 sethostname sys_sethostname __ia32_sys_sethostname +75 i386 setrlimit sys_setrlimit __ia32_compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit __ia32_compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage __ia32_compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday __ia32_compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday __ia32_compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 __ia32_sys_getgroups16 +81 i386 setgroups sys_setgroups16 __ia32_sys_setgroups16 +82 i386 select sys_old_select __ia32_compat_sys_old_select +83 i386 symlink sys_symlink __ia32_sys_symlink +84 i386 oldlstat sys_lstat __ia32_sys_lstat +85 i386 readlink sys_readlink __ia32_sys_readlink +86 i386 uselib sys_uselib __ia32_sys_uselib +87 i386 swapon sys_swapon __ia32_sys_swapon +88 i386 reboot sys_reboot __ia32_sys_reboot +89 i386 readdir sys_old_readdir __ia32_compat_sys_old_readdir +90 i386 mmap sys_old_mmap __ia32_compat_sys_x86_mmap +91 i386 munmap sys_munmap __ia32_sys_munmap +92 i386 truncate sys_truncate __ia32_compat_sys_truncate +93 i386 ftruncate sys_ftruncate __ia32_compat_sys_ftruncate +94 i386 fchmod sys_fchmod __ia32_sys_fchmod +95 i386 fchown sys_fchown16 __ia32_sys_fchown16 +96 i386 getpriority sys_getpriority __ia32_sys_getpriority +97 i386 setpriority sys_setpriority __ia32_sys_setpriority 98 i386 profil -99 i386 statfs sys_statfs compat_sys_statfs -100 i386 fstatfs sys_fstatfs compat_sys_fstatfs -101 i386 ioperm sys_ioperm -102 i386 socketcall sys_socketcall compat_sys_socketcall -103 i386 syslog sys_syslog -104 i386 setitimer sys_setitimer compat_sys_setitimer -105 i386 getitimer sys_getitimer compat_sys_getitimer -106 i386 stat sys_newstat compat_sys_newstat -107 i386 lstat sys_newlstat compat_sys_newlstat -108 
i386 fstat sys_newfstat compat_sys_newfstat -109 i386 olduname sys_uname -110 i386 iopl sys_iopl -111 i386 vhangup sys_vhangup +99 i386 statfs sys_statfs __ia32_compat_sys_statfs +100 i386 fstatfs sys_fstatfs __ia32_compat_sys_fstatfs +101 i386 ioperm sys_ioperm __ia32_sys_ioperm +102 i386 socketcall sys_socketcall __ia32_compat_sys_socketcall +103 i386 syslog sys_syslog __ia32_sys_syslog +104 i386 setitimer sys_setitimer __ia32_compat_sys_setitimer +105 i386 getitimer sys_getitimer __ia32_compat_sys_getitimer +106 i386 stat sys_newstat __ia32_compat_sys_newstat +107 i386 lstat sys_newlstat __ia32_compat_sys_newlstat +108 i386 fstat sys_newfstat __ia32_compat_sys_newfstat +109 i386 olduname sys_uname __ia32_sys_uname +110 i386 iopl sys_iopl __ia32_sys_iopl +111 i386 vhangup sys_vhangup __ia32_sys_vhangup 112 i386 idle 113 i386 vm86old sys_vm86old sys_ni_syscall -114 i386 wait4 sys_wait4 compat_sys_wait4 -115 i386 swapoff sys_swapoff -116 i386 sysinfo sys_sysinfo compat_sys_sysinfo -117 i386 ipc sys_ipc compat_sys_ipc -118 i386 fsync sys_fsync +114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 +115 i386 swapoff sys_swapoff __ia32_sys_swapoff +116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo +117 i386 ipc sys_ipc __ia32_compat_sys_ipc +118 i386 fsync sys_fsync __ia32_sys_fsync 119 i386 sigreturn sys_sigreturn sys32_sigreturn -120 i386 clone sys_clone stub32_clone -121 i386 setdomainname sys_setdomainname -122 i386 uname sys_newuname -123 i386 modify_ldt sys_modify_ldt -124 i386 adjtimex sys_adjtimex compat_sys_adjtimex -125 i386 mprotect sys_mprotect -126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +120 i386 clone sys_clone __ia32_compat_sys_x86_clone +121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname +122 i386 uname sys_newuname __ia32_sys_newuname +123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt +124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex +125 i386 mprotect sys_mprotect __ia32_sys_mprotect +126 i386 
sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask 127 i386 create_module -128 i386 init_module sys_init_module -129 i386 delete_module sys_delete_module +128 i386 init_module sys_init_module __ia32_sys_init_module +129 i386 delete_module sys_delete_module __ia32_sys_delete_module 130 i386 get_kernel_syms -131 i386 quotactl sys_quotactl sys32_quotactl -132 i386 getpgid sys_getpgid -133 i386 fchdir sys_fchdir -134 i386 bdflush sys_bdflush -135 i386 sysfs sys_sysfs -136 i386 personality sys_personality +131 i386 quotactl sys_quotactl __ia32_compat_sys_quotactl32 +132 i386 getpgid sys_getpgid __ia32_sys_getpgid +133 i386 fchdir sys_fchdir __ia32_sys_fchdir +134 i386 bdflush sys_bdflush __ia32_sys_bdflush +135 i386 sysfs sys_sysfs __ia32_sys_sysfs +136 i386 personality sys_personality __ia32_sys_personality 137 i386 afs_syscall -138 i386 setfsuid sys_setfsuid16 -139 i386 setfsgid sys_setfsgid16 -140 i386 _llseek sys_llseek -141 i386 getdents sys_getdents compat_sys_getdents -142 i386 _newselect sys_select compat_sys_select -143 i386 flock sys_flock -144 i386 msync sys_msync -145 i386 readv sys_readv compat_sys_readv -146 i386 writev sys_writev compat_sys_writev -147 i386 getsid sys_getsid -148 i386 fdatasync sys_fdatasync -149 i386 _sysctl sys_sysctl compat_sys_sysctl -150 i386 mlock sys_mlock -151 i386 munlock sys_munlock -152 i386 mlockall sys_mlockall -153 i386 munlockall sys_munlockall -154 i386 sched_setparam sys_sched_setparam -155 i386 sched_getparam sys_sched_getparam -156 i386 sched_setscheduler sys_sched_setscheduler -157 i386 sched_getscheduler sys_sched_getscheduler -158 i386 sched_yield sys_sched_yield -159 i386 sched_get_priority_max sys_sched_get_priority_max -160 i386 sched_get_priority_min sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 i386 nanosleep sys_nanosleep compat_sys_nanosleep -163 i386 mremap sys_mremap -164 i386 setresuid sys_setresuid16 -165 i386 getresuid 
sys_getresuid16 +138 i386 setfsuid sys_setfsuid16 __ia32_sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 __ia32_sys_setfsgid16 +140 i386 _llseek sys_llseek __ia32_sys_llseek +141 i386 getdents sys_getdents __ia32_compat_sys_getdents +142 i386 _newselect sys_select __ia32_compat_sys_select +143 i386 flock sys_flock __ia32_sys_flock +144 i386 msync sys_msync __ia32_sys_msync +145 i386 readv sys_readv __ia32_compat_sys_readv +146 i386 writev sys_writev __ia32_compat_sys_writev +147 i386 getsid sys_getsid __ia32_sys_getsid +148 i386 fdatasync sys_fdatasync __ia32_sys_fdatasync +149 i386 _sysctl sys_sysctl __ia32_compat_sys_sysctl +150 i386 mlock sys_mlock __ia32_sys_mlock +151 i386 munlock sys_munlock __ia32_sys_munlock +152 i386 mlockall sys_mlockall __ia32_sys_mlockall +153 i386 munlockall sys_munlockall __ia32_sys_munlockall +154 i386 sched_setparam sys_sched_setparam __ia32_sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam __ia32_sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler __ia32_sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler __ia32_sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep +163 i386 mremap sys_mremap __ia32_sys_mremap +164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 +165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 166 i386 vm86 sys_vm86 sys_ni_syscall 167 i386 query_module -168 i386 poll sys_poll +168 i386 poll sys_poll __ia32_sys_poll 169 i386 nfsservctl -170 i386 setresgid sys_setresgid16 -171 i386 getresgid sys_getresgid16 -172 i386 prctl sys_prctl +170 i386 setresgid sys_setresgid16 
__ia32_sys_setresgid16 +171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 +172 i386 prctl sys_prctl __ia32_sys_prctl 173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn -174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask -176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait -178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend -180 i386 pread64 sys_pread64 sys32_pread -181 i386 pwrite64 sys_pwrite64 sys32_pwrite -182 i386 chown sys_chown16 -183 i386 getcwd sys_getcwd -184 i386 capget sys_capget -185 i386 capset sys_capset -186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 i386 sendfile sys_sendfile compat_sys_sendfile +174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread +181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite +182 i386 chown sys_chown16 __ia32_sys_chown16 +183 i386 getcwd sys_getcwd __ia32_sys_getcwd +184 i386 capget sys_capget __ia32_sys_capget +185 i386 capset sys_capset __ia32_sys_capset +186 i386 sigaltstack sys_sigaltstack __ia32_compat_sys_sigaltstack +187 i386 sendfile sys_sendfile __ia32_compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork sys_vfork -191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit -192 i386 mmap2 sys_mmap_pgoff -193 i386 truncate64 sys_truncate64 sys32_truncate64 -194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 
-195 i386 stat64 sys_stat64 sys32_stat64 -196 i386 lstat64 sys_lstat64 sys32_lstat64 -197 i386 fstat64 sys_fstat64 sys32_fstat64 -198 i386 lchown32 sys_lchown -199 i386 getuid32 sys_getuid -200 i386 getgid32 sys_getgid -201 i386 geteuid32 sys_geteuid -202 i386 getegid32 sys_getegid -203 i386 setreuid32 sys_setreuid -204 i386 setregid32 sys_setregid -205 i386 getgroups32 sys_getgroups -206 i386 setgroups32 sys_setgroups -207 i386 fchown32 sys_fchown -208 i386 setresuid32 sys_setresuid -209 i386 getresuid32 sys_getresuid -210 i386 setresgid32 sys_setresgid -211 i386 getresgid32 sys_getresgid -212 i386 chown32 sys_chown -213 i386 setuid32 sys_setuid -214 i386 setgid32 sys_setgid -215 i386 setfsuid32 sys_setfsuid -216 i386 setfsgid32 sys_setfsgid -217 i386 pivot_root sys_pivot_root -218 i386 mincore sys_mincore -219 i386 madvise sys_madvise -220 i386 getdents64 sys_getdents64 -221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +190 i386 vfork sys_vfork __ia32_sys_vfork +191 i386 ugetrlimit sys_getrlimit __ia32_compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff __ia32_sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 __ia32_compat_sys_x86_truncate64 +194 i386 ftruncate64 sys_ftruncate64 __ia32_compat_sys_x86_ftruncate64 +195 i386 stat64 sys_stat64 __ia32_compat_sys_x86_stat64 +196 i386 lstat64 sys_lstat64 __ia32_compat_sys_x86_lstat64 +197 i386 fstat64 sys_fstat64 __ia32_compat_sys_x86_fstat64 +198 i386 lchown32 sys_lchown __ia32_sys_lchown +199 i386 getuid32 sys_getuid __ia32_sys_getuid +200 i386 getgid32 sys_getgid __ia32_sys_getgid +201 i386 geteuid32 sys_geteuid __ia32_sys_geteuid +202 i386 getegid32 sys_getegid __ia32_sys_getegid +203 i386 setreuid32 sys_setreuid __ia32_sys_setreuid +204 i386 setregid32 sys_setregid __ia32_sys_setregid +205 i386 getgroups32 sys_getgroups __ia32_sys_getgroups +206 i386 setgroups32 sys_setgroups __ia32_sys_setgroups +207 i386 fchown32 sys_fchown __ia32_sys_fchown +208 i386 setresuid32 sys_setresuid __ia32_sys_setresuid +209 i386 
getresuid32 sys_getresuid __ia32_sys_getresuid +210 i386 setresgid32 sys_setresgid __ia32_sys_setresgid +211 i386 getresgid32 sys_getresgid __ia32_sys_getresgid +212 i386 chown32 sys_chown __ia32_sys_chown +213 i386 setuid32 sys_setuid __ia32_sys_setuid +214 i386 setgid32 sys_setgid __ia32_sys_setgid +215 i386 setfsuid32 sys_setfsuid __ia32_sys_setfsuid +216 i386 setfsgid32 sys_setfsgid __ia32_sys_setfsgid +217 i386 pivot_root sys_pivot_root __ia32_sys_pivot_root +218 i386 mincore sys_mincore __ia32_sys_mincore +219 i386 madvise sys_madvise __ia32_sys_madvise +220 i386 getdents64 sys_getdents64 __ia32_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 __ia32_compat_sys_fcntl64 # 222 is unused # 223 is unused -224 i386 gettid sys_gettid -225 i386 readahead sys_readahead sys32_readahead -226 i386 setxattr sys_setxattr -227 i386 lsetxattr sys_lsetxattr -228 i386 fsetxattr sys_fsetxattr -229 i386 getxattr sys_getxattr -230 i386 lgetxattr sys_lgetxattr -231 i386 fgetxattr sys_fgetxattr -232 i386 listxattr sys_listxattr -233 i386 llistxattr sys_llistxattr -234 i386 flistxattr sys_flistxattr -235 i386 removexattr sys_removexattr -236 i386 lremovexattr sys_lremovexattr -237 i386 fremovexattr sys_fremovexattr -238 i386 tkill sys_tkill -239 i386 sendfile64 sys_sendfile64 -240 i386 futex sys_futex compat_sys_futex -241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity -243 i386 set_thread_area sys_set_thread_area -244 i386 get_thread_area sys_get_thread_area -245 i386 io_setup sys_io_setup compat_sys_io_setup -246 i386 io_destroy sys_io_destroy -247 i386 io_getevents sys_io_getevents compat_sys_io_getevents -248 i386 io_submit sys_io_submit compat_sys_io_submit -249 i386 io_cancel sys_io_cancel -250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +224 i386 gettid sys_gettid __ia32_sys_gettid +225 i386 readahead sys_readahead __ia32_compat_sys_x86_readahead +226 i386 setxattr 
sys_setxattr __ia32_sys_setxattr +227 i386 lsetxattr sys_lsetxattr __ia32_sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr __ia32_sys_fsetxattr +229 i386 getxattr sys_getxattr __ia32_sys_getxattr +230 i386 lgetxattr sys_lgetxattr __ia32_sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr __ia32_sys_fgetxattr +232 i386 listxattr sys_listxattr __ia32_sys_listxattr +233 i386 llistxattr sys_llistxattr __ia32_sys_llistxattr +234 i386 flistxattr sys_flistxattr __ia32_sys_flistxattr +235 i386 removexattr sys_removexattr __ia32_sys_removexattr +236 i386 lremovexattr sys_lremovexattr __ia32_sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr +238 i386 tkill sys_tkill __ia32_sys_tkill +239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64 +240 i386 futex sys_futex __ia32_compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area +245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup +246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy +247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents +248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit +249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel +250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) -252 i386 exit_group sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie -254 i386 epoll_create sys_epoll_create -255 i386 epoll_ctl sys_epoll_ctl -256 i386 epoll_wait sys_epoll_wait -257 i386 remap_file_pages sys_remap_file_pages -258 i386 set_tid_address sys_set_tid_address -259 i386 timer_create sys_timer_create compat_sys_timer_create -260 i386 timer_settime sys_timer_settime 
compat_sys_timer_settime -261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime -262 i386 timer_getoverrun sys_timer_getoverrun -263 i386 timer_delete sys_timer_delete -264 i386 clock_settime sys_clock_settime compat_sys_clock_settime -265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime -266 i386 clock_getres sys_clock_getres compat_sys_clock_getres -267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep -268 i386 statfs64 sys_statfs64 compat_sys_statfs64 -269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -270 i386 tgkill sys_tgkill -271 i386 utimes sys_utimes compat_sys_utimes -272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +252 i386 exit_group sys_exit_group __ia32_sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie __ia32_compat_sys_lookup_dcookie +254 i386 epoll_create sys_epoll_create __ia32_sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl __ia32_sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait __ia32_sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address +259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create +260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete +264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill __ia32_sys_tgkill +271 i386 utimes 
sys_utimes __ia32_compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64 273 i386 vserver -274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -276 i386 set_mempolicy sys_set_mempolicy -277 i386 mq_open sys_mq_open compat_sys_mq_open -278 i386 mq_unlink sys_mq_unlink -279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive -281 i386 mq_notify sys_mq_notify compat_sys_mq_notify -282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr -283 i386 kexec_load sys_kexec_load compat_sys_kexec_load -284 i386 waitid sys_waitid compat_sys_waitid +274 i386 mbind sys_mbind __ia32_sys_mbind +275 i386 get_mempolicy sys_get_mempolicy __ia32_compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy +277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load +284 i386 waitid sys_waitid __ia32_compat_sys_waitid # 285 sys_setaltroot -286 i386 add_key sys_add_key -287 i386 request_key sys_request_key -288 i386 keyctl sys_keyctl compat_sys_keyctl -289 i386 ioprio_set sys_ioprio_set -290 i386 ioprio_get sys_ioprio_get -291 i386 inotify_init sys_inotify_init -292 i386 inotify_add_watch sys_inotify_add_watch -293 i386 inotify_rm_watch sys_inotify_rm_watch -294 i386 migrate_pages sys_migrate_pages -295 i386 openat sys_openat compat_sys_openat -296 i386 mkdirat sys_mkdirat -297 i386 mknodat sys_mknodat -298 i386 fchownat sys_fchownat -299 i386 futimesat sys_futimesat compat_sys_futimesat -300 i386 
fstatat64 sys_fstatat64 sys32_fstatat -301 i386 unlinkat sys_unlinkat -302 i386 renameat sys_renameat -303 i386 linkat sys_linkat -304 i386 symlinkat sys_symlinkat -305 i386 readlinkat sys_readlinkat -306 i386 fchmodat sys_fchmodat -307 i386 faccessat sys_faccessat -308 i386 pselect6 sys_pselect6 compat_sys_pselect6 -309 i386 ppoll sys_ppoll compat_sys_ppoll -310 i386 unshare sys_unshare -311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list -312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list -313 i386 splice sys_splice -314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range -315 i386 tee sys_tee -316 i386 vmsplice sys_vmsplice compat_sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages -318 i386 getcpu sys_getcpu -319 i386 epoll_pwait sys_epoll_pwait -320 i386 utimensat sys_utimensat compat_sys_utimensat -321 i386 signalfd sys_signalfd compat_sys_signalfd -322 i386 timerfd_create sys_timerfd_create -323 i386 eventfd sys_eventfd -324 i386 fallocate sys_fallocate sys32_fallocate -325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime -327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 -328 i386 eventfd2 sys_eventfd2 -329 i386 epoll_create1 sys_epoll_create1 -330 i386 dup3 sys_dup3 -331 i386 pipe2 sys_pipe2 -332 i386 inotify_init1 sys_inotify_init1 -333 i386 preadv sys_preadv compat_sys_preadv -334 i386 pwritev sys_pwritev compat_sys_pwritev -335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -336 i386 perf_event_open sys_perf_event_open -337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg -338 i386 fanotify_init sys_fanotify_init -339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -340 i386 prlimit64 sys_prlimit64 -341 i386 name_to_handle_at sys_name_to_handle_at -342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -343 i386 clock_adjtime 
sys_clock_adjtime compat_sys_clock_adjtime -344 i386 syncfs sys_syncfs -345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg -346 i386 setns sys_setns -347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev -349 i386 kcmp sys_kcmp -350 i386 finit_module sys_finit_module -351 i386 sched_setattr sys_sched_setattr -352 i386 sched_getattr sys_sched_getattr -353 i386 renameat2 sys_renameat2 -354 i386 seccomp sys_seccomp -355 i386 getrandom sys_getrandom -356 i386 memfd_create sys_memfd_create -357 i386 bpf sys_bpf -358 i386 execveat sys_execveat compat_sys_execveat -359 i386 socket sys_socket -360 i386 socketpair sys_socketpair -361 i386 bind sys_bind -362 i386 connect sys_connect -363 i386 listen sys_listen -364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt -367 i386 getsockname sys_getsockname -368 i386 getpeername sys_getpeername -369 i386 sendto sys_sendto -370 i386 sendmsg sys_sendmsg compat_sys_sendmsg -371 i386 recvfrom sys_recvfrom compat_sys_recvfrom -372 i386 recvmsg sys_recvmsg compat_sys_recvmsg -373 i386 shutdown sys_shutdown -374 i386 userfaultfd sys_userfaultfd -375 i386 membarrier sys_membarrier -376 i386 mlock2 sys_mlock2 -377 i386 copy_file_range sys_copy_file_range -378 i386 preadv2 sys_preadv2 compat_sys_preadv2 -379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 -380 i386 pkey_mprotect sys_pkey_mprotect -381 i386 pkey_alloc sys_pkey_alloc -382 i386 pkey_free sys_pkey_free -383 i386 statx sys_statx -384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +286 i386 add_key sys_add_key __ia32_sys_add_key +287 i386 request_key sys_request_key __ia32_sys_request_key +288 i386 keyctl sys_keyctl __ia32_compat_sys_keyctl +289 i386 ioprio_set sys_ioprio_set __ia32_sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get __ia32_sys_ioprio_get +291 i386 inotify_init 
sys_inotify_init __ia32_sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch __ia32_sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch __ia32_sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages __ia32_sys_migrate_pages +295 i386 openat sys_openat __ia32_compat_sys_openat +296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat +297 i386 mknodat sys_mknodat __ia32_sys_mknodat +298 i386 fchownat sys_fchownat __ia32_sys_fchownat +299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat +301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat +302 i386 renameat sys_renameat __ia32_sys_renameat +303 i386 linkat sys_linkat __ia32_sys_linkat +304 i386 symlinkat sys_symlinkat __ia32_sys_symlinkat +305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat +306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat +307 i386 faccessat sys_faccessat __ia32_sys_faccessat +308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6 +309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll +310 i386 unshare sys_unshare __ia32_sys_unshare +311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list +313 i386 splice sys_splice __ia32_sys_splice +314 i386 sync_file_range sys_sync_file_range __ia32_compat_sys_x86_sync_file_range +315 i386 tee sys_tee __ia32_sys_tee +316 i386 vmsplice sys_vmsplice __ia32_compat_sys_vmsplice +317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages +318 i386 getcpu sys_getcpu __ia32_sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait +320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat +321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create +323 i386 eventfd sys_eventfd __ia32_sys_eventfd +324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate +325 i386 
timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1 +330 i386 dup3 sys_dup3 __ia32_sys_dup3 +331 i386 pipe2 sys_pipe2 __ia32_sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 __ia32_sys_inotify_init1 +333 i386 preadv sys_preadv __ia32_compat_sys_preadv +334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs __ia32_sys_syncfs +345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg +346 i386 setns sys_setns __ia32_sys_setns +347 i386 process_vm_readv sys_process_vm_readv __ia32_compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev __ia32_compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp __ia32_sys_kcmp +350 i386 finit_module sys_finit_module __ia32_sys_finit_module +351 i386 sched_setattr sys_sched_setattr __ia32_sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr __ia32_sys_sched_getattr +353 i386 renameat2 sys_renameat2 __ia32_sys_renameat2 +354 i386 seccomp sys_seccomp __ia32_sys_seccomp +355 i386 getrandom sys_getrandom __ia32_sys_getrandom +356 i386 memfd_create sys_memfd_create 
__ia32_sys_memfd_create +357 i386 bpf sys_bpf __ia32_sys_bpf +358 i386 execveat sys_execveat __ia32_compat_sys_execveat +359 i386 socket sys_socket __ia32_sys_socket +360 i386 socketpair sys_socketpair __ia32_sys_socketpair +361 i386 bind sys_bind __ia32_sys_bind +362 i386 connect sys_connect __ia32_sys_connect +363 i386 listen sys_listen __ia32_sys_listen +364 i386 accept4 sys_accept4 __ia32_sys_accept4 +365 i386 getsockopt sys_getsockopt __ia32_compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt __ia32_compat_sys_setsockopt +367 i386 getsockname sys_getsockname __ia32_sys_getsockname +368 i386 getpeername sys_getpeername __ia32_sys_getpeername +369 i386 sendto sys_sendto __ia32_sys_sendto +370 i386 sendmsg sys_sendmsg __ia32_compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom __ia32_compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg __ia32_compat_sys_recvmsg +373 i386 shutdown sys_shutdown __ia32_sys_shutdown +374 i386 userfaultfd sys_userfaultfd __ia32_sys_userfaultfd +375 i386 membarrier sys_membarrier __ia32_sys_membarrier +376 i386 mlock2 sys_mlock2 __ia32_sys_mlock2 +377 i386 copy_file_range sys_copy_file_range __ia32_sys_copy_file_range +378 i386 preadv2 sys_preadv2 __ia32_compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 __ia32_compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect __ia32_sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc __ia32_sys_pkey_alloc +382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free +383 i386 statx sys_statx __ia32_sys_statx +384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183e2f85..4dfe42666d0c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -4,379 +4,383 @@ # The format is: # <number> <abi> <name> <entry point> # +# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls +# # The abi is "common", "64" or "x32" for this file. 
# -0 common read sys_read -1 common write sys_write -2 common open sys_open -3 common close sys_close -4 common stat sys_newstat -5 common fstat sys_newfstat -6 common lstat sys_newlstat -7 common poll sys_poll -8 common lseek sys_lseek -9 common mmap sys_mmap -10 common mprotect sys_mprotect -11 common munmap sys_munmap -12 common brk sys_brk -13 64 rt_sigaction sys_rt_sigaction -14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn sys_rt_sigreturn/ptregs -16 64 ioctl sys_ioctl -17 common pread64 sys_pread64 -18 common pwrite64 sys_pwrite64 -19 64 readv sys_readv -20 64 writev sys_writev -21 common access sys_access -22 common pipe sys_pipe -23 common select sys_select -24 common sched_yield sys_sched_yield -25 common mremap sys_mremap -26 common msync sys_msync -27 common mincore sys_mincore -28 common madvise sys_madvise -29 common shmget sys_shmget -30 common shmat sys_shmat -31 common shmctl sys_shmctl -32 common dup sys_dup -33 common dup2 sys_dup2 -34 common pause sys_pause -35 common nanosleep sys_nanosleep -36 common getitimer sys_getitimer -37 common alarm sys_alarm -38 common setitimer sys_setitimer -39 common getpid sys_getpid -40 common sendfile sys_sendfile64 -41 common socket sys_socket -42 common connect sys_connect -43 common accept sys_accept -44 common sendto sys_sendto -45 64 recvfrom sys_recvfrom -46 64 sendmsg sys_sendmsg -47 64 recvmsg sys_recvmsg -48 common shutdown sys_shutdown -49 common bind sys_bind -50 common listen sys_listen -51 common getsockname sys_getsockname -52 common getpeername sys_getpeername -53 common socketpair sys_socketpair -54 64 setsockopt sys_setsockopt -55 64 getsockopt sys_getsockopt -56 common clone sys_clone/ptregs -57 common fork sys_fork/ptregs -58 common vfork sys_vfork/ptregs -59 64 execve sys_execve/ptregs -60 common exit sys_exit -61 common wait4 sys_wait4 -62 common kill sys_kill -63 common uname sys_newuname -64 common semget sys_semget -65 common semop sys_semop -66 common semctl sys_semctl -67 
common shmdt sys_shmdt -68 common msgget sys_msgget -69 common msgsnd sys_msgsnd -70 common msgrcv sys_msgrcv -71 common msgctl sys_msgctl -72 common fcntl sys_fcntl -73 common flock sys_flock -74 common fsync sys_fsync -75 common fdatasync sys_fdatasync -76 common truncate sys_truncate -77 common ftruncate sys_ftruncate -78 common getdents sys_getdents -79 common getcwd sys_getcwd -80 common chdir sys_chdir -81 common fchdir sys_fchdir -82 common rename sys_rename -83 common mkdir sys_mkdir -84 common rmdir sys_rmdir -85 common creat sys_creat -86 common link sys_link -87 common unlink sys_unlink -88 common symlink sys_symlink -89 common readlink sys_readlink -90 common chmod sys_chmod -91 common fchmod sys_fchmod -92 common chown sys_chown -93 common fchown sys_fchown -94 common lchown sys_lchown -95 common umask sys_umask -96 common gettimeofday sys_gettimeofday -97 common getrlimit sys_getrlimit -98 common getrusage sys_getrusage -99 common sysinfo sys_sysinfo -100 common times sys_times -101 64 ptrace sys_ptrace -102 common getuid sys_getuid -103 common syslog sys_syslog -104 common getgid sys_getgid -105 common setuid sys_setuid -106 common setgid sys_setgid -107 common geteuid sys_geteuid -108 common getegid sys_getegid -109 common setpgid sys_setpgid -110 common getppid sys_getppid -111 common getpgrp sys_getpgrp -112 common setsid sys_setsid -113 common setreuid sys_setreuid -114 common setregid sys_setregid -115 common getgroups sys_getgroups -116 common setgroups sys_setgroups -117 common setresuid sys_setresuid -118 common getresuid sys_getresuid -119 common setresgid sys_setresgid -120 common getresgid sys_getresgid -121 common getpgid sys_getpgid -122 common setfsuid sys_setfsuid -123 common setfsgid sys_setfsgid -124 common getsid sys_getsid -125 common capget sys_capget -126 common capset sys_capset -127 64 rt_sigpending sys_rt_sigpending -128 64 rt_sigtimedwait sys_rt_sigtimedwait -129 64 rt_sigqueueinfo sys_rt_sigqueueinfo -130 common 
rt_sigsuspend sys_rt_sigsuspend -131 64 sigaltstack sys_sigaltstack -132 common utime sys_utime -133 common mknod sys_mknod +0 common read __x64_sys_read +1 common write __x64_sys_write +2 common open __x64_sys_open +3 common close __x64_sys_close +4 common stat __x64_sys_newstat +5 common fstat __x64_sys_newfstat +6 common lstat __x64_sys_newlstat +7 common poll __x64_sys_poll +8 common lseek __x64_sys_lseek +9 common mmap __x64_sys_mmap +10 common mprotect __x64_sys_mprotect +11 common munmap __x64_sys_munmap +12 common brk __x64_sys_brk +13 64 rt_sigaction __x64_sys_rt_sigaction +14 common rt_sigprocmask __x64_sys_rt_sigprocmask +15 64 rt_sigreturn __x64_sys_rt_sigreturn/ptregs +16 64 ioctl __x64_sys_ioctl +17 common pread64 __x64_sys_pread64 +18 common pwrite64 __x64_sys_pwrite64 +19 64 readv __x64_sys_readv +20 64 writev __x64_sys_writev +21 common access __x64_sys_access +22 common pipe __x64_sys_pipe +23 common select __x64_sys_select +24 common sched_yield __x64_sys_sched_yield +25 common mremap __x64_sys_mremap +26 common msync __x64_sys_msync +27 common mincore __x64_sys_mincore +28 common madvise __x64_sys_madvise +29 common shmget __x64_sys_shmget +30 common shmat __x64_sys_shmat +31 common shmctl __x64_sys_shmctl +32 common dup __x64_sys_dup +33 common dup2 __x64_sys_dup2 +34 common pause __x64_sys_pause +35 common nanosleep __x64_sys_nanosleep +36 common getitimer __x64_sys_getitimer +37 common alarm __x64_sys_alarm +38 common setitimer __x64_sys_setitimer +39 common getpid __x64_sys_getpid +40 common sendfile __x64_sys_sendfile64 +41 common socket __x64_sys_socket +42 common connect __x64_sys_connect +43 common accept __x64_sys_accept +44 common sendto __x64_sys_sendto +45 64 recvfrom __x64_sys_recvfrom +46 64 sendmsg __x64_sys_sendmsg +47 64 recvmsg __x64_sys_recvmsg +48 common shutdown __x64_sys_shutdown +49 common bind __x64_sys_bind +50 common listen __x64_sys_listen +51 common getsockname __x64_sys_getsockname +52 common getpeername 
__x64_sys_getpeername +53 common socketpair __x64_sys_socketpair +54 64 setsockopt __x64_sys_setsockopt +55 64 getsockopt __x64_sys_getsockopt +56 common clone __x64_sys_clone/ptregs +57 common fork __x64_sys_fork/ptregs +58 common vfork __x64_sys_vfork/ptregs +59 64 execve __x64_sys_execve/ptregs +60 common exit __x64_sys_exit +61 common wait4 __x64_sys_wait4 +62 common kill __x64_sys_kill +63 common uname __x64_sys_newuname +64 common semget __x64_sys_semget +65 common semop __x64_sys_semop +66 common semctl __x64_sys_semctl +67 common shmdt __x64_sys_shmdt +68 common msgget __x64_sys_msgget +69 common msgsnd __x64_sys_msgsnd +70 common msgrcv __x64_sys_msgrcv +71 common msgctl __x64_sys_msgctl +72 common fcntl __x64_sys_fcntl +73 common flock __x64_sys_flock +74 common fsync __x64_sys_fsync +75 common fdatasync __x64_sys_fdatasync +76 common truncate __x64_sys_truncate +77 common ftruncate __x64_sys_ftruncate +78 common getdents __x64_sys_getdents +79 common getcwd __x64_sys_getcwd +80 common chdir __x64_sys_chdir +81 common fchdir __x64_sys_fchdir +82 common rename __x64_sys_rename +83 common mkdir __x64_sys_mkdir +84 common rmdir __x64_sys_rmdir +85 common creat __x64_sys_creat +86 common link __x64_sys_link +87 common unlink __x64_sys_unlink +88 common symlink __x64_sys_symlink +89 common readlink __x64_sys_readlink +90 common chmod __x64_sys_chmod +91 common fchmod __x64_sys_fchmod +92 common chown __x64_sys_chown +93 common fchown __x64_sys_fchown +94 common lchown __x64_sys_lchown +95 common umask __x64_sys_umask +96 common gettimeofday __x64_sys_gettimeofday +97 common getrlimit __x64_sys_getrlimit +98 common getrusage __x64_sys_getrusage +99 common sysinfo __x64_sys_sysinfo +100 common times __x64_sys_times +101 64 ptrace __x64_sys_ptrace +102 common getuid __x64_sys_getuid +103 common syslog __x64_sys_syslog +104 common getgid __x64_sys_getgid +105 common setuid __x64_sys_setuid +106 common setgid __x64_sys_setgid +107 common geteuid __x64_sys_geteuid 
+108 common getegid __x64_sys_getegid +109 common setpgid __x64_sys_setpgid +110 common getppid __x64_sys_getppid +111 common getpgrp __x64_sys_getpgrp +112 common setsid __x64_sys_setsid +113 common setreuid __x64_sys_setreuid +114 common setregid __x64_sys_setregid +115 common getgroups __x64_sys_getgroups +116 common setgroups __x64_sys_setgroups +117 common setresuid __x64_sys_setresuid +118 common getresuid __x64_sys_getresuid +119 common setresgid __x64_sys_setresgid +120 common getresgid __x64_sys_getresgid +121 common getpgid __x64_sys_getpgid +122 common setfsuid __x64_sys_setfsuid +123 common setfsgid __x64_sys_setfsgid +124 common getsid __x64_sys_getsid +125 common capget __x64_sys_capget +126 common capset __x64_sys_capset +127 64 rt_sigpending __x64_sys_rt_sigpending +128 64 rt_sigtimedwait __x64_sys_rt_sigtimedwait +129 64 rt_sigqueueinfo __x64_sys_rt_sigqueueinfo +130 common rt_sigsuspend __x64_sys_rt_sigsuspend +131 64 sigaltstack __x64_sys_sigaltstack +132 common utime __x64_sys_utime +133 common mknod __x64_sys_mknod 134 64 uselib -135 common personality sys_personality -136 common ustat sys_ustat -137 common statfs sys_statfs -138 common fstatfs sys_fstatfs -139 common sysfs sys_sysfs -140 common getpriority sys_getpriority -141 common setpriority sys_setpriority -142 common sched_setparam sys_sched_setparam -143 common sched_getparam sys_sched_getparam -144 common sched_setscheduler sys_sched_setscheduler -145 common sched_getscheduler sys_sched_getscheduler -146 common sched_get_priority_max sys_sched_get_priority_max -147 common sched_get_priority_min sys_sched_get_priority_min -148 common sched_rr_get_interval sys_sched_rr_get_interval -149 common mlock sys_mlock -150 common munlock sys_munlock -151 common mlockall sys_mlockall -152 common munlockall sys_munlockall -153 common vhangup sys_vhangup -154 common modify_ldt sys_modify_ldt -155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl -157 common prctl sys_prctl -158 common 
arch_prctl sys_arch_prctl -159 common adjtimex sys_adjtimex -160 common setrlimit sys_setrlimit -161 common chroot sys_chroot -162 common sync sys_sync -163 common acct sys_acct -164 common settimeofday sys_settimeofday -165 common mount sys_mount -166 common umount2 sys_umount -167 common swapon sys_swapon -168 common swapoff sys_swapoff -169 common reboot sys_reboot -170 common sethostname sys_sethostname -171 common setdomainname sys_setdomainname -172 common iopl sys_iopl/ptregs -173 common ioperm sys_ioperm +135 common personality __x64_sys_personality +136 common ustat __x64_sys_ustat +137 common statfs __x64_sys_statfs +138 common fstatfs __x64_sys_fstatfs +139 common sysfs __x64_sys_sysfs +140 common getpriority __x64_sys_getpriority +141 common setpriority __x64_sys_setpriority +142 common sched_setparam __x64_sys_sched_setparam +143 common sched_getparam __x64_sys_sched_getparam +144 common sched_setscheduler __x64_sys_sched_setscheduler +145 common sched_getscheduler __x64_sys_sched_getscheduler +146 common sched_get_priority_max __x64_sys_sched_get_priority_max +147 common sched_get_priority_min __x64_sys_sched_get_priority_min +148 common sched_rr_get_interval __x64_sys_sched_rr_get_interval +149 common mlock __x64_sys_mlock +150 common munlock __x64_sys_munlock +151 common mlockall __x64_sys_mlockall +152 common munlockall __x64_sys_munlockall +153 common vhangup __x64_sys_vhangup +154 common modify_ldt __x64_sys_modify_ldt +155 common pivot_root __x64_sys_pivot_root +156 64 _sysctl __x64_sys_sysctl +157 common prctl __x64_sys_prctl +158 common arch_prctl __x64_sys_arch_prctl +159 common adjtimex __x64_sys_adjtimex +160 common setrlimit __x64_sys_setrlimit +161 common chroot __x64_sys_chroot +162 common sync __x64_sys_sync +163 common acct __x64_sys_acct +164 common settimeofday __x64_sys_settimeofday +165 common mount __x64_sys_mount +166 common umount2 __x64_sys_umount +167 common swapon __x64_sys_swapon +168 common swapoff __x64_sys_swapoff +169 
common reboot __x64_sys_reboot +170 common sethostname __x64_sys_sethostname +171 common setdomainname __x64_sys_setdomainname +172 common iopl __x64_sys_iopl/ptregs +173 common ioperm __x64_sys_ioperm 174 64 create_module -175 common init_module sys_init_module -176 common delete_module sys_delete_module +175 common init_module __x64_sys_init_module +176 common delete_module __x64_sys_delete_module 177 64 get_kernel_syms 178 64 query_module -179 common quotactl sys_quotactl +179 common quotactl __x64_sys_quotactl 180 64 nfsservctl 181 common getpmsg 182 common putpmsg 183 common afs_syscall 184 common tuxcall 185 common security -186 common gettid sys_gettid -187 common readahead sys_readahead -188 common setxattr sys_setxattr -189 common lsetxattr sys_lsetxattr -190 common fsetxattr sys_fsetxattr -191 common getxattr sys_getxattr -192 common lgetxattr sys_lgetxattr -193 common fgetxattr sys_fgetxattr -194 common listxattr sys_listxattr -195 common llistxattr sys_llistxattr -196 common flistxattr sys_flistxattr -197 common removexattr sys_removexattr -198 common lremovexattr sys_lremovexattr -199 common fremovexattr sys_fremovexattr -200 common tkill sys_tkill -201 common time sys_time -202 common futex sys_futex -203 common sched_setaffinity sys_sched_setaffinity -204 common sched_getaffinity sys_sched_getaffinity +186 common gettid __x64_sys_gettid +187 common readahead __x64_sys_readahead +188 common setxattr __x64_sys_setxattr +189 common lsetxattr __x64_sys_lsetxattr +190 common fsetxattr __x64_sys_fsetxattr +191 common getxattr __x64_sys_getxattr +192 common lgetxattr __x64_sys_lgetxattr +193 common fgetxattr __x64_sys_fgetxattr +194 common listxattr __x64_sys_listxattr +195 common llistxattr __x64_sys_llistxattr +196 common flistxattr __x64_sys_flistxattr +197 common removexattr __x64_sys_removexattr +198 common lremovexattr __x64_sys_lremovexattr +199 common fremovexattr __x64_sys_fremovexattr +200 common tkill __x64_sys_tkill +201 common time 
__x64_sys_time +202 common futex __x64_sys_futex +203 common sched_setaffinity __x64_sys_sched_setaffinity +204 common sched_getaffinity __x64_sys_sched_getaffinity 205 64 set_thread_area -206 64 io_setup sys_io_setup -207 common io_destroy sys_io_destroy -208 common io_getevents sys_io_getevents -209 64 io_submit sys_io_submit -210 common io_cancel sys_io_cancel +206 64 io_setup __x64_sys_io_setup +207 common io_destroy __x64_sys_io_destroy +208 common io_getevents __x64_sys_io_getevents +209 64 io_submit __x64_sys_io_submit +210 common io_cancel __x64_sys_io_cancel 211 64 get_thread_area -212 common lookup_dcookie sys_lookup_dcookie -213 common epoll_create sys_epoll_create +212 common lookup_dcookie __x64_sys_lookup_dcookie +213 common epoll_create __x64_sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old -216 common remap_file_pages sys_remap_file_pages -217 common getdents64 sys_getdents64 -218 common set_tid_address sys_set_tid_address -219 common restart_syscall sys_restart_syscall -220 common semtimedop sys_semtimedop -221 common fadvise64 sys_fadvise64 -222 64 timer_create sys_timer_create -223 common timer_settime sys_timer_settime -224 common timer_gettime sys_timer_gettime -225 common timer_getoverrun sys_timer_getoverrun -226 common timer_delete sys_timer_delete -227 common clock_settime sys_clock_settime -228 common clock_gettime sys_clock_gettime -229 common clock_getres sys_clock_getres -230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group -232 common epoll_wait sys_epoll_wait -233 common epoll_ctl sys_epoll_ctl -234 common tgkill sys_tgkill -235 common utimes sys_utimes +216 common remap_file_pages __x64_sys_remap_file_pages +217 common getdents64 __x64_sys_getdents64 +218 common set_tid_address __x64_sys_set_tid_address +219 common restart_syscall __x64_sys_restart_syscall +220 common semtimedop __x64_sys_semtimedop +221 common fadvise64 __x64_sys_fadvise64 +222 64 timer_create __x64_sys_timer_create +223 
common timer_settime __x64_sys_timer_settime +224 common timer_gettime __x64_sys_timer_gettime +225 common timer_getoverrun __x64_sys_timer_getoverrun +226 common timer_delete __x64_sys_timer_delete +227 common clock_settime __x64_sys_clock_settime +228 common clock_gettime __x64_sys_clock_gettime +229 common clock_getres __x64_sys_clock_getres +230 common clock_nanosleep __x64_sys_clock_nanosleep +231 common exit_group __x64_sys_exit_group +232 common epoll_wait __x64_sys_epoll_wait +233 common epoll_ctl __x64_sys_epoll_ctl +234 common tgkill __x64_sys_tgkill +235 common utimes __x64_sys_utimes 236 64 vserver -237 common mbind sys_mbind -238 common set_mempolicy sys_set_mempolicy -239 common get_mempolicy sys_get_mempolicy -240 common mq_open sys_mq_open -241 common mq_unlink sys_mq_unlink -242 common mq_timedsend sys_mq_timedsend -243 common mq_timedreceive sys_mq_timedreceive -244 64 mq_notify sys_mq_notify -245 common mq_getsetattr sys_mq_getsetattr -246 64 kexec_load sys_kexec_load -247 64 waitid sys_waitid -248 common add_key sys_add_key -249 common request_key sys_request_key -250 common keyctl sys_keyctl -251 common ioprio_set sys_ioprio_set -252 common ioprio_get sys_ioprio_get -253 common inotify_init sys_inotify_init -254 common inotify_add_watch sys_inotify_add_watch -255 common inotify_rm_watch sys_inotify_rm_watch -256 common migrate_pages sys_migrate_pages -257 common openat sys_openat -258 common mkdirat sys_mkdirat -259 common mknodat sys_mknodat -260 common fchownat sys_fchownat -261 common futimesat sys_futimesat -262 common newfstatat sys_newfstatat -263 common unlinkat sys_unlinkat -264 common renameat sys_renameat -265 common linkat sys_linkat -266 common symlinkat sys_symlinkat -267 common readlinkat sys_readlinkat -268 common fchmodat sys_fchmodat -269 common faccessat sys_faccessat -270 common pselect6 sys_pselect6 -271 common ppoll sys_ppoll -272 common unshare sys_unshare -273 64 set_robust_list sys_set_robust_list -274 64 get_robust_list 
sys_get_robust_list -275 common splice sys_splice -276 common tee sys_tee -277 common sync_file_range sys_sync_file_range -278 64 vmsplice sys_vmsplice -279 64 move_pages sys_move_pages -280 common utimensat sys_utimensat -281 common epoll_pwait sys_epoll_pwait -282 common signalfd sys_signalfd -283 common timerfd_create sys_timerfd_create -284 common eventfd sys_eventfd -285 common fallocate sys_fallocate -286 common timerfd_settime sys_timerfd_settime -287 common timerfd_gettime sys_timerfd_gettime -288 common accept4 sys_accept4 -289 common signalfd4 sys_signalfd4 -290 common eventfd2 sys_eventfd2 -291 common epoll_create1 sys_epoll_create1 -292 common dup3 sys_dup3 -293 common pipe2 sys_pipe2 -294 common inotify_init1 sys_inotify_init1 -295 64 preadv sys_preadv -296 64 pwritev sys_pwritev -297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo -298 common perf_event_open sys_perf_event_open -299 64 recvmmsg sys_recvmmsg -300 common fanotify_init sys_fanotify_init -301 common fanotify_mark sys_fanotify_mark -302 common prlimit64 sys_prlimit64 -303 common name_to_handle_at sys_name_to_handle_at -304 common open_by_handle_at sys_open_by_handle_at -305 common clock_adjtime sys_clock_adjtime -306 common syncfs sys_syncfs -307 64 sendmmsg sys_sendmmsg -308 common setns sys_setns -309 common getcpu sys_getcpu -310 64 process_vm_readv sys_process_vm_readv -311 64 process_vm_writev sys_process_vm_writev -312 common kcmp sys_kcmp -313 common finit_module sys_finit_module -314 common sched_setattr sys_sched_setattr -315 common sched_getattr sys_sched_getattr -316 common renameat2 sys_renameat2 -317 common seccomp sys_seccomp -318 common getrandom sys_getrandom -319 common memfd_create sys_memfd_create -320 common kexec_file_load sys_kexec_file_load -321 common bpf sys_bpf -322 64 execveat sys_execveat/ptregs -323 common userfaultfd sys_userfaultfd -324 common membarrier sys_membarrier -325 common mlock2 sys_mlock2 -326 common copy_file_range sys_copy_file_range -327 64 preadv2 
sys_preadv2 -328 64 pwritev2 sys_pwritev2 -329 common pkey_mprotect sys_pkey_mprotect -330 common pkey_alloc sys_pkey_alloc -331 common pkey_free sys_pkey_free -332 common statx sys_statx +237 common mbind __x64_sys_mbind +238 common set_mempolicy __x64_sys_set_mempolicy +239 common get_mempolicy __x64_sys_get_mempolicy +240 common mq_open __x64_sys_mq_open +241 common mq_unlink __x64_sys_mq_unlink +242 common mq_timedsend __x64_sys_mq_timedsend +243 common mq_timedreceive __x64_sys_mq_timedreceive +244 64 mq_notify __x64_sys_mq_notify +245 common mq_getsetattr __x64_sys_mq_getsetattr +246 64 kexec_load __x64_sys_kexec_load +247 64 waitid __x64_sys_waitid +248 common add_key __x64_sys_add_key +249 common request_key __x64_sys_request_key +250 common keyctl __x64_sys_keyctl +251 common ioprio_set __x64_sys_ioprio_set +252 common ioprio_get __x64_sys_ioprio_get +253 common inotify_init __x64_sys_inotify_init +254 common inotify_add_watch __x64_sys_inotify_add_watch +255 common inotify_rm_watch __x64_sys_inotify_rm_watch +256 common migrate_pages __x64_sys_migrate_pages +257 common openat __x64_sys_openat +258 common mkdirat __x64_sys_mkdirat +259 common mknodat __x64_sys_mknodat +260 common fchownat __x64_sys_fchownat +261 common futimesat __x64_sys_futimesat +262 common newfstatat __x64_sys_newfstatat +263 common unlinkat __x64_sys_unlinkat +264 common renameat __x64_sys_renameat +265 common linkat __x64_sys_linkat +266 common symlinkat __x64_sys_symlinkat +267 common readlinkat __x64_sys_readlinkat +268 common fchmodat __x64_sys_fchmodat +269 common faccessat __x64_sys_faccessat +270 common pselect6 __x64_sys_pselect6 +271 common ppoll __x64_sys_ppoll +272 common unshare __x64_sys_unshare +273 64 set_robust_list __x64_sys_set_robust_list +274 64 get_robust_list __x64_sys_get_robust_list +275 common splice __x64_sys_splice +276 common tee __x64_sys_tee +277 common sync_file_range __x64_sys_sync_file_range +278 64 vmsplice __x64_sys_vmsplice +279 64 move_pages 
__x64_sys_move_pages +280 common utimensat __x64_sys_utimensat +281 common epoll_pwait __x64_sys_epoll_pwait +282 common signalfd __x64_sys_signalfd +283 common timerfd_create __x64_sys_timerfd_create +284 common eventfd __x64_sys_eventfd +285 common fallocate __x64_sys_fallocate +286 common timerfd_settime __x64_sys_timerfd_settime +287 common timerfd_gettime __x64_sys_timerfd_gettime +288 common accept4 __x64_sys_accept4 +289 common signalfd4 __x64_sys_signalfd4 +290 common eventfd2 __x64_sys_eventfd2 +291 common epoll_create1 __x64_sys_epoll_create1 +292 common dup3 __x64_sys_dup3 +293 common pipe2 __x64_sys_pipe2 +294 common inotify_init1 __x64_sys_inotify_init1 +295 64 preadv __x64_sys_preadv +296 64 pwritev __x64_sys_pwritev +297 64 rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo +298 common perf_event_open __x64_sys_perf_event_open +299 64 recvmmsg __x64_sys_recvmmsg +300 common fanotify_init __x64_sys_fanotify_init +301 common fanotify_mark __x64_sys_fanotify_mark +302 common prlimit64 __x64_sys_prlimit64 +303 common name_to_handle_at __x64_sys_name_to_handle_at +304 common open_by_handle_at __x64_sys_open_by_handle_at +305 common clock_adjtime __x64_sys_clock_adjtime +306 common syncfs __x64_sys_syncfs +307 64 sendmmsg __x64_sys_sendmmsg +308 common setns __x64_sys_setns +309 common getcpu __x64_sys_getcpu +310 64 process_vm_readv __x64_sys_process_vm_readv +311 64 process_vm_writev __x64_sys_process_vm_writev +312 common kcmp __x64_sys_kcmp +313 common finit_module __x64_sys_finit_module +314 common sched_setattr __x64_sys_sched_setattr +315 common sched_getattr __x64_sys_sched_getattr +316 common renameat2 __x64_sys_renameat2 +317 common seccomp __x64_sys_seccomp +318 common getrandom __x64_sys_getrandom +319 common memfd_create __x64_sys_memfd_create +320 common kexec_file_load __x64_sys_kexec_file_load +321 common bpf __x64_sys_bpf +322 64 execveat __x64_sys_execveat/ptregs +323 common userfaultfd __x64_sys_userfaultfd +324 common membarrier 
__x64_sys_membarrier +325 common mlock2 __x64_sys_mlock2 +326 common copy_file_range __x64_sys_copy_file_range +327 64 preadv2 __x64_sys_preadv2 +328 64 pwritev2 __x64_sys_pwritev2 +329 common pkey_mprotect __x64_sys_pkey_mprotect +330 common pkey_alloc __x64_sys_pkey_alloc +331 common pkey_free __x64_sys_pkey_free +332 common statx __x64_sys_statx # # x32-specific system call numbers start at 512 to avoid cache impact -# for native 64-bit operation. +# for native 64-bit operation. The __x32_compat_sys stubs are created +# on-the-fly for compat_sys_*() compatibility system calls if X86_X32 +# is defined. # -512 x32 rt_sigaction compat_sys_rt_sigaction +512 x32 rt_sigaction __x32_compat_sys_rt_sigaction 513 x32 rt_sigreturn sys32_x32_rt_sigreturn -514 x32 ioctl compat_sys_ioctl -515 x32 readv compat_sys_readv -516 x32 writev compat_sys_writev -517 x32 recvfrom compat_sys_recvfrom -518 x32 sendmsg compat_sys_sendmsg -519 x32 recvmsg compat_sys_recvmsg -520 x32 execve compat_sys_execve/ptregs -521 x32 ptrace compat_sys_ptrace -522 x32 rt_sigpending compat_sys_rt_sigpending -523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait -524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo -525 x32 sigaltstack compat_sys_sigaltstack -526 x32 timer_create compat_sys_timer_create -527 x32 mq_notify compat_sys_mq_notify -528 x32 kexec_load compat_sys_kexec_load -529 x32 waitid compat_sys_waitid -530 x32 set_robust_list compat_sys_set_robust_list -531 x32 get_robust_list compat_sys_get_robust_list -532 x32 vmsplice compat_sys_vmsplice -533 x32 move_pages compat_sys_move_pages -534 x32 preadv compat_sys_preadv64 -535 x32 pwritev compat_sys_pwritev64 -536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -537 x32 recvmmsg compat_sys_recvmmsg -538 x32 sendmmsg compat_sys_sendmmsg -539 x32 process_vm_readv compat_sys_process_vm_readv -540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt -543 x32 io_setup 
compat_sys_io_setup -544 x32 io_submit compat_sys_io_submit -545 x32 execveat compat_sys_execveat/ptregs -546 x32 preadv2 compat_sys_preadv64v2 -547 x32 pwritev2 compat_sys_pwritev64v2 +514 x32 ioctl __x32_compat_sys_ioctl +515 x32 readv __x32_compat_sys_readv +516 x32 writev __x32_compat_sys_writev +517 x32 recvfrom __x32_compat_sys_recvfrom +518 x32 sendmsg __x32_compat_sys_sendmsg +519 x32 recvmsg __x32_compat_sys_recvmsg +520 x32 execve __x32_compat_sys_execve/ptregs +521 x32 ptrace __x32_compat_sys_ptrace +522 x32 rt_sigpending __x32_compat_sys_rt_sigpending +523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo +525 x32 sigaltstack __x32_compat_sys_sigaltstack +526 x32 timer_create __x32_compat_sys_timer_create +527 x32 mq_notify __x32_compat_sys_mq_notify +528 x32 kexec_load __x32_compat_sys_kexec_load +529 x32 waitid __x32_compat_sys_waitid +530 x32 set_robust_list __x32_compat_sys_set_robust_list +531 x32 get_robust_list __x32_compat_sys_get_robust_list +532 x32 vmsplice __x32_compat_sys_vmsplice +533 x32 move_pages __x32_compat_sys_move_pages +534 x32 preadv __x32_compat_sys_preadv64 +535 x32 pwritev __x32_compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg __x32_compat_sys_recvmmsg +538 x32 sendmmsg __x32_compat_sys_sendmmsg +539 x32 process_vm_readv __x32_compat_sys_process_vm_readv +540 x32 process_vm_writev __x32_compat_sys_process_vm_writev +541 x32 setsockopt __x32_compat_sys_setsockopt +542 x32 getsockopt __x32_compat_sys_getsockopt +543 x32 io_setup __x32_compat_sys_io_setup +544 x32 io_submit __x32_compat_sys_io_submit +545 x32 execveat __x32_compat_sys_execveat/ptregs +546 x32 preadv2 __x32_compat_sys_preadv64v2 +547 x32 pwritev2 __x32_compat_sys_pwritev64v2 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index d71ef4bd3615..94fcd1951aca 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ 
b/arch/x86/entry/syscalls/syscalltbl.sh @@ -25,15 +25,27 @@ emit() { nr="$2" entry="$3" compat="$4" + umlentry="" if [ "$abi" = "64" -a -n "$compat" ]; then echo "a compat entry for a 64-bit syscall makes no sense" >&2 exit 1 fi + # For CONFIG_UML, we need to strip the __x64_sys prefix + if [ "$abi" = "64" -a "${entry}" != "${entry#__x64_sys}" ]; then + umlentry="sys${entry#__x64_sys}" + fi + if [ -z "$compat" ]; then - if [ -n "$entry" ]; then + if [ -n "$entry" -a -z "$umlentry" ]; then syscall_macro "$abi" "$nr" "$entry" + elif [ -n "$umlentry" ]; then # implies -n "$entry" + echo "#ifdef CONFIG_X86" + syscall_macro "$abi" "$nr" "$entry" + echo "#else /* CONFIG_UML */" + syscall_macro "$abi" "$nr" "$umlentry" + echo "#endif" fi else echo "#ifdef CONFIG_X86_32" diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 1943aebadede..d998a487c9b1 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -42,9 +42,7 @@ vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) obj-y += $(vdso_img_objs) targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) -.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ - $(vdso_img-y:%=$(obj)/vdso%.so) +targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) export CPPFLAGS_vdso.lds += -P -C diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 7780bbfb06ef..9242b28418d5 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -5,8 +5,6 @@ #undef CONFIG_OPTIMIZE_INLINING #endif -#undef CONFIG_X86_PPRO_FENCE - #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 577fa8adb785..70b7845434cb 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -42,10 +42,8 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { 
EMULATE, NATIVE, NONE } vsyscall_mode = -#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE) - NATIVE; -#elif defined(CONFIG_LEGACY_VSYSCALL_NONE) +static enum { EMULATE, NONE } vsyscall_mode = +#ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #else EMULATE; @@ -56,8 +54,6 @@ static int __init vsyscall_setup(char *str) if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; else if (!strcmp("none", str)) vsyscall_mode = NONE; else @@ -131,6 +127,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_err; long ret; + unsigned long orig_dx; /* * No point in checking CS -- the only way to get here is a user mode @@ -139,10 +136,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) WARN_ON_ONCE(address != regs->ip); - /* This should be unreachable in NATIVE mode. */ - if (WARN_ON(vsyscall_mode == NATIVE)) - return false; - if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); @@ -235,19 +228,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ret = -EFAULT; switch (vsyscall_nr) { case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); + /* this decodes regs->di and regs->si on its own */ + ret = __x64_sys_gettimeofday(regs); break; case 1: - ret = sys_time((time_t __user *)regs->di); + /* this decodes regs->di on its own */ + ret = __x64_sys_time(regs); break; case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); + /* while we could clobber regs->dx, we didn't in the past... 
*/ + orig_dx = regs->dx; + regs->dx = 0; + /* this decodes regs->di, regs->si and regs->dx on its own */ + ret = __x64_sys_getcpu(regs); + regs->dx = orig_dx; break; } @@ -355,7 +351,7 @@ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); #if CONFIG_PGTABLE_LEVELS >= 5 - p4d->p4d |= _PAGE_USER; + set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); #endif pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); @@ -370,9 +366,7 @@ void __init map_vsyscall(void) if (vsyscall_mode != NONE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 140d33288e78..a6006e7bb729 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -48,7 +48,7 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -990,7 +990,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (!dogrp) return n; - list_for_each_entry(event, &leader->sibling_list, group_entry) { + for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; @@ -1156,16 +1156,13 @@ int x86_perf_event_set_period(struct perf_event *event) per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || - local64_read(&hwc->prev_count) != (u64)-left) { - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); + /* + * The hw event starts counting from this event 
offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); - } + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Due to erratum on certan cpu we need @@ -1884,6 +1881,8 @@ early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { + if (x86_pmu.read) + return x86_pmu.read(event); x86_perf_event_update(event); } @@ -2118,7 +2117,8 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } - if (READ_ONCE(x86_pmu.attr_rdpmc)) + if (READ_ONCE(x86_pmu.attr_rdpmc) && + !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; return err; @@ -2206,9 +2206,9 @@ static ssize_t set_attr_rdpmc(struct device *cdev, * but only root can trigger it, so it's okay. */ if (val == 2) - static_key_slow_inc(&rdpmc_always_available); + static_branch_inc(&rdpmc_always_available_key); else - static_key_slow_dec(&rdpmc_always_available); + static_branch_dec(&rdpmc_always_available_key); on_each_cpu(refresh_pce, NULL, 1); } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 731153a4681e..707b2a96e516 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2060,6 +2060,14 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_pebs_del(event); } +static void intel_pmu_read_event(struct perf_event *event) +{ + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_auto_reload_read(event); + else + x86_perf_event_update(event); +} + static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { int idx = hwc->idx - INTEL_PMC_IDX_FIXED; @@ -2201,16 +2209,23 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 status; int handled; + int pmu_enabled; cpuc = this_cpu_ptr(&cpu_hw_events); /* + * Save the PMU state. + * It needs to be restored when leaving the handler. 
+ */ + pmu_enabled = cpuc->enabled; + /* * No known reason to not always do late ACK, * but just in case do it opt-in. */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); intel_bts_disable_local(); + cpuc->enabled = 0; __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); @@ -2320,7 +2335,8 @@ again: done: /* Only restore PMU state when it's active. See x86_pmu_disable(). */ - if (cpuc->enabled) + cpuc->enabled = pmu_enabled; + if (pmu_enabled) __intel_pmu_enable_all(0, true); intel_bts_enable_local(); @@ -2952,9 +2968,9 @@ static void intel_pebs_aliases_skl(struct perf_event *event) return intel_pebs_aliases_precdist(event); } -static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) { - unsigned long flags = x86_pmu.free_running_flags; + unsigned long flags = x86_pmu.large_pebs_flags; if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; @@ -2976,8 +2992,8 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_free_running_flags(event))) - event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + ~intel_pmu_large_pebs_flags(event))) + event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3188,13 +3204,13 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. 
*/ -static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +static u64 bdw_limit_period(struct perf_event *event, u64 left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { if (left < 128) left = 128; - left &= ~0x3fu; + left &= ~0x3fULL; } return left; } @@ -3323,7 +3339,8 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; - flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (x86_pmu.version > 1) + flip_smm_bit(&x86_pmu.attr_freeze_on_smi); if (!cpuc->shared_regs) return; @@ -3460,7 +3477,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, + .large_pebs_flags = LARGE_PEBS_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32-bit width, @@ -3486,6 +3503,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dying = intel_pmu_cpu_dying, }; +static struct attribute *intel_pmu_attrs[]; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -3495,6 +3514,7 @@ static __initconst const struct x86_pmu intel_pmu = { .disable = intel_pmu_disable_event, .add = intel_pmu_add_event, .del = intel_pmu_del_event, + .read = intel_pmu_read_event, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -3502,7 +3522,7 @@ static __initconst const struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, + .large_pebs_flags = LARGE_PEBS_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -3516,6 +3536,8 @@ static __initconst const struct x86_pmu intel_pmu = { .format_attrs = intel_arch3_formats_attr, .events_sysfs_show = intel_event_sysfs_show, + 
.attrs = intel_pmu_attrs, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, @@ -3559,7 +3581,7 @@ static int intel_snb_pebs_broken(int cpu) break; case INTEL_FAM6_SANDYBRIDGE_X: - switch (cpu_data(cpu).x86_mask) { + switch (cpu_data(cpu).x86_stepping) { case 6: rev = 0x618; break; case 7: rev = 0x70c; break; } @@ -3894,8 +3916,6 @@ __init int intel_pmu_init(void) x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); - - x86_pmu.attrs = intel_pmu_attrs; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 72db0664a53d..9aca448bb8e6 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,50 +40,51 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM + * Available model: SLM,AMT,GLM,CNL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM, + CNL * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, + * SKL,KNL,GLM,CNL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL + * Available model: SNB,IVB,HSW,BDW,SKL,CNL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM + * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. 
* perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL - * GLM + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, + * GLM,CNL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM + * SKL,KNL,GLM,CNL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT only + * Available model: HSW ULT,CNL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT only + * Available model: HSW ULT,CNL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT, GLM + * Available model: HSW ULT,GLM,CNL * Scope: Package (physical package) * */ @@ -486,6 +487,21 @@ static const struct cstate_model hswult_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model cnl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -557,6 +573,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), 
X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 18c25ab28557..8a10a045b57b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -935,7 +935,7 @@ void intel_pmu_pebs_add(struct perf_event *event) bool needed_cb = pebs_needs_sched_cb(cpuc); cpuc->n_pebs++; - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; pebs_update_state(needed_cb, cpuc, event->ctx->pmu); @@ -975,7 +975,7 @@ void intel_pmu_pebs_del(struct perf_event *event) bool needed_cb = pebs_needs_sched_cb(cpuc); cpuc->n_pebs--; - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; pebs_update_state(needed_cb, cpuc, event->ctx->pmu); @@ -1196,8 +1196,13 @@ static void setup_pebs_sample_data(struct perf_event *event, * and PMI. */ *regs = *iregs; - regs->flags = pebs->flags; - set_linear_ip(regs, pebs->ip); + + /* + * Initialize regs_>flags from PEBS, + * Clear exact bit (which uses x86 EFLAGS Reserved bit 3), + * i.e., do not rely on it being zero: + */ + regs->flags = pebs->flags & ~PERF_EFLAGS_EXACT; if (sample_type & PERF_SAMPLE_REGS_INTR) { regs->ax = pebs->ax; @@ -1217,10 +1222,6 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->sp = pebs->sp; } - /* - * Preserve PERF_EFLAGS_VM from set_linear_ip(). 
- */ - regs->flags = pebs->flags | (regs->flags & PERF_EFLAGS_VM); #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; regs->r9 = pebs->r9; @@ -1233,13 +1234,35 @@ static void setup_pebs_sample_data(struct perf_event *event, #endif } - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs->ip = pebs->real_ip; - regs->flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) - regs->flags |= PERF_EFLAGS_EXACT; - else - regs->flags &= ~PERF_EFLAGS_EXACT; + if (event->attr.precise_ip > 1) { + /* + * Haswell and later processors have an 'eventing IP' + * (real IP) which fixes the off-by-1 skid in hardware. + * Use it when precise_ip >= 2 : + */ + if (x86_pmu.intel_cap.pebs_format >= 2) { + set_linear_ip(regs, pebs->real_ip); + regs->flags |= PERF_EFLAGS_EXACT; + } else { + /* Otherwise, use PEBS off-by-1 IP: */ + set_linear_ip(regs, pebs->ip); + + /* + * With precise_ip >= 2, try to fix up the off-by-1 IP + * using the LBR. If successful, the fixup function + * corrects regs->ip and calls set_linear_ip() on regs: + */ + if (intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; + } + } else { + /* + * When precise_ip == 1, return the PEBS off-by-1 IP, + * no fixup attempted: + */ + set_linear_ip(regs, pebs->ip); + } + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) @@ -1306,17 +1329,93 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) return NULL; } +void intel_pmu_auto_reload_read(struct perf_event *event) +{ + WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)); + + perf_pmu_disable(event->pmu); + intel_pmu_drain_pebs_buffer(); + perf_pmu_enable(event->pmu); +} + +/* + * Special variant of intel_pmu_save_and_restart() for auto-reload. 
+ */ +static int +intel_pmu_save_and_restart_reload(struct perf_event *event, int count) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - x86_pmu.cntval_bits; + u64 period = hwc->sample_period; + u64 prev_raw_count, new_raw_count; + s64 new, old; + + WARN_ON(!period); + + /* + * drain_pebs() only happens when the PMU is disabled. + */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + + prev_raw_count = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); + local64_set(&hwc->prev_count, new_raw_count); + + /* + * Since the counter increments a negative counter value and + * overflows on the sign switch, giving the interval: + * + * [-period, 0] + * + * the difference between two consequtive reads is: + * + * A) value2 - value1; + * when no overflows have happened in between, + * + * B) (0 - value1) + (value2 - (-period)); + * when one overflow happened in between, + * + * C) (0 - value1) + (n - 1) * (period) + (value2 - (-period)); + * when @n overflows happened in between. + * + * Here A) is the obvious difference, B) is the extension to the + * discrete interval, where the first term is to the top of the + * interval and the second term is from the bottom of the next + * interval and C) the extension to multiple intervals, where the + * middle term is the whole intervals covered. 
+ * + * An equivalent of C, by reduction, is: + * + * value2 - value1 + n * period + */ + new = ((s64)(new_raw_count << shift) >> shift); + old = ((s64)(prev_raw_count << shift) >> shift); + local64_add(new - old + count * period, &event->count); + + perf_event_update_userpage(event); + + return 0; +} + static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *base, void *top, int bit, int count) { + struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; struct pt_regs regs; void *at = get_next_pebs_record_by_bit(base, top, bit); - if (!intel_pmu_save_and_restart(event) && - !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } else if (!intel_pmu_save_and_restart(event)) return; while (count > 1) { @@ -1368,8 +1467,11 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; n = top - at; - if (n <= 0) + if (n <= 0) { + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); return; + } __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } @@ -1392,8 +1494,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) ds->pebs_index = ds->pebs_buffer_base; - if (unlikely(base >= top)) + if (unlikely(base >= top)) { + /* + * The drain_pebs() could be called twice in a short period + * for auto-reload event in pmu::read(). There are no + * overflows have happened in between. + * It needs to call intel_pmu_save_and_restart_reload() to + * update the event->count for this case. 
+ */ + for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, + x86_pmu.max_pebs_events) { + event = cpuc->events[bit]; + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); + } return; + } for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; @@ -1530,7 +1646,7 @@ void __init intel_ds_init(void) x86_pmu.pebs_record_size = sizeof(struct pebs_record_skl); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; default: diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index ae64d0b69729..cf372b90557e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1186,7 +1186,7 @@ void __init intel_pmu_lbr_init_atom(void) * on PMU interrupt */ if (boot_cpu_data.x86_model == 28 - && boot_cpu_data.x86_mask < 10) { + && boot_cpu_data.x86_stepping < 10) { pr_cont("LBR disabled due to erratum"); return; } diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index a5604c352930..408879b0c0d4 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -234,7 +234,7 @@ static __initconst const struct x86_pmu p6_pmu = { static __init void p6_pmu_rdpmc_quirk(void) { - if (boot_cpu_data.x86_mask < 9) { + if (boot_cpu_data.x86_stepping < 9) { /* * PPro erratum 26; fixed in stepping 9 and above. */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 81fd41d5a0d9..3b993942a0e4 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1186,8 +1186,12 @@ static int pt_event_addr_filters_validate(struct list_head *filters) int range = 0; list_for_each_entry(filter, filters, entry) { - /* PT doesn't support single address triggers */ - if (!filter->range || !filter->size) + /* + * PT doesn't support single address triggers and + * 'start' filters. 
+ */ + if (!filter->size || + filter->action == PERF_ADDR_FILTER_ACTION_START) return -EOPNOTSUPP; if (!filter->inode) { @@ -1227,7 +1231,10 @@ static void pt_event_addr_filters_sync(struct perf_event *event) filters->filter[range].msr_a = msr_a; filters->filter[range].msr_b = msr_b; - filters->filter[range].config = filter->filter ? 1 : 2; + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER) + filters->filter[range].config = 1; + else + filters->filter[range].config = 2; range++; } diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index a2efb490f743..32f3e9423e99 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -774,6 +774,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7874c980d569..a7956fc7ca1d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -354,7 +354,7 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, if (!dogrp) return n; - list_for_each_entry(event, &leader->sibling_list, group_entry) { + for_each_sibling_event(event, leader) { if (!is_box_event(box, event) || event->state <= PERF_EVENT_STATE_OFF) continue; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 6d8044ab1060..77076a102e34 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3028,10 +3028,27 @@ static struct intel_uncore_type bdx_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +static struct intel_uncore_type bdx_uncore_sbox = { + .name = 
"sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +#define BDX_MSR_UNCORE_SBOX 3 + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, &hswep_uncore_pcu, + &bdx_uncore_sbox, NULL, }; @@ -3043,10 +3060,25 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { + int pkg = topology_phys_to_logical_pkg(0); + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + /* BDX-DE doesn't have SBOX */ + if (boot_cpu_data.x86_model == 86) { + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + /* Detect systems with no SBOXes */ + } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { + struct pci_dev *pdev; + u32 capid4; + + pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; + pci_read_config_dword(pdev, 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + } hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3264,6 +3296,11 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), }, + { /* PCU.3 (for Capability registers) */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, { /* end: all zeroes */ } }; @@ -3343,6 +3380,7 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8), + 
SNBEP_CBO_EVENT_EXTRA_REG(0x38, 0xff, 0x3), EVENT_EXTRA_END }; @@ -3562,24 +3600,27 @@ static struct intel_uncore_type *skx_msr_uncores[] = { NULL, }; +/* + * To determine the number of CHAs, it should read bits 27:0 in the CAPID6 + * register which located at Device 30, Function 3, Offset 0x9C. PCI ID 0x2083. + */ +#define SKX_CAPID6 0x9c +#define SKX_CHA_BIT_MASK GENMASK(27, 0) + static int skx_count_chabox(void) { - struct pci_dev *chabox_dev = NULL; - int bus, count = 0; + struct pci_dev *dev = NULL; + u32 val = 0; - while (1) { - chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev); - if (!chabox_dev) - break; - if (count == 0) - bus = chabox_dev->bus->number; - if (bus != chabox_dev->bus->number) - break; - count++; - } + dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x2083, dev); + if (!dev) + goto out; - pci_dev_put(chabox_dev); - return count; + pci_read_config_dword(dev, SKX_CAPID6, &val); + val &= SKX_CHA_BIT_MASK; +out: + pci_dev_put(dev); + return hweight32(val); } void skx_uncore_cpu_init(void) @@ -3606,7 +3647,7 @@ static struct intel_uncore_type skx_uncore_imc = { }; static struct attribute *skx_upi_uncore_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_umask_ext.attr, &format_attr_edge.attr, &format_attr_inv.attr, diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 18e2628e2d8f..e7edf19e64c2 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -188,10 +188,11 @@ static inline u64 msr_read_counter(struct perf_event *event) if (event->hw.event_base) rdmsrl(event->hw.event_base, now); else - rdtscll(now); + now = rdtsc_ordered(); return now; } + static void msr_event_update(struct perf_event *event) { u64 prev, now; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 78f91ec1056e..9f3711470ec1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -69,7 +69,7 @@ struct event_constraint { #define 
PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ +#define PERF_X86_EVENT_LARGE_PEBS 0x0800 /* use large PEBS */ struct amd_nb { @@ -88,7 +88,7 @@ struct amd_nb { * REGS_USER can be handled for events limited to ring 3. * */ -#define PEBS_FREERUNNING_FLAGS \ +#define LARGE_PEBS_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ @@ -520,6 +520,7 @@ struct x86_pmu { void (*disable)(struct perf_event *); void (*add)(struct perf_event *); void (*del)(struct perf_event *); + void (*read)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -557,7 +558,7 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; bool late_ack; - unsigned (*limit_period)(struct perf_event *event, unsigned l); + u64 (*limit_period)(struct perf_event *event, u64 l); /* * sysfs attrs @@ -608,7 +609,7 @@ struct x86_pmu { struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); int max_pebs_events; - unsigned long free_running_flags; + unsigned long large_pebs_flags; /* * Intel LBR @@ -923,6 +924,8 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); +void intel_pmu_auto_reload_read(struct perf_event *event); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 2edc49e7409b..cfecc2272f2d 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -21,7 +21,7 @@ #include <asm/apic.h> 
#include <asm/desc.h> #include <asm/hypervisor.h> -#include <asm/hyperv.h> +#include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <linux/version.h> #include <linux/vmalloc.h> @@ -88,11 +88,15 @@ EXPORT_SYMBOL_GPL(hyperv_cs); u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); +struct hv_vp_assist_page **hv_vp_assist_page; +EXPORT_SYMBOL_GPL(hv_vp_assist_page); + u32 hv_max_vp_index; static int hv_cpu_init(unsigned int cpu) { u64 msr_vp_index; + struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; hv_get_vp_index(msr_vp_index); @@ -101,6 +105,22 @@ static int hv_cpu_init(unsigned int cpu) if (msr_vp_index > hv_max_vp_index) hv_max_vp_index = msr_vp_index; + if (!hv_vp_assist_page) + return 0; + + if (!*hvp) + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + + if (*hvp) { + u64 val; + + val = vmalloc_to_pfn(*hvp); + val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val); + } + return 0; } @@ -198,6 +218,9 @@ static int hv_cpu_die(unsigned int cpu) struct hv_reenlightenment_control re_ctrl; unsigned int new_cpu; + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); + if (hv_reenlightenment_cb == NULL) return 0; @@ -224,6 +247,7 @@ void hyperv_init(void) { u64 guest_id, required_msrs; union hv_x64_msr_hypercall_contents hypercall_msr; + int cpuhp; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; @@ -241,9 +265,17 @@ void hyperv_init(void) if (!hv_vp_index) return; - if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", - hv_cpu_init, hv_cpu_die) < 0) + hv_vp_assist_page = kcalloc(num_possible_cpus(), + sizeof(*hv_vp_assist_page), GFP_KERNEL); + if (!hv_vp_assist_page) { + ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; goto free_vp_index; + } + + cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, hv_cpu_die); + if (cpuhp < 0) + goto free_vp_assist_page; /* 
* Setup the hypercall page and enable hypercalls. @@ -256,7 +288,7 @@ void hyperv_init(void) hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - goto free_vp_index; + goto remove_cpuhp_state; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); @@ -304,6 +336,11 @@ register_msr_cs: return; +remove_cpuhp_state: + cpuhp_remove_state(cpuhp); +free_vp_assist_page: + kfree(hv_vp_assist_page); + hv_vp_assist_page = NULL; free_vp_index: kfree(hv_vp_index); hv_vp_index = NULL; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 41c671854642..86b1341cba9a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -33,7 +33,6 @@ #include <asm/vdso.h> #include <asm/sigframe.h> #include <asm/sighandling.h> -#include <asm/sys_ia32.h> #include <asm/smap.h> /* diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 96cd33bbfc85..11ef7b7c9cc8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -41,27 +41,28 @@ #include <linux/highuid.h> #include <linux/sysctl.h> #include <linux/slab.h> +#include <linux/sched/task.h> #include <asm/mman.h> #include <asm/types.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <asm/vgtod.h> -#include <asm/sys_ia32.h> +#include <asm/ia32.h> #define AA(__x) ((unsigned long)(__x)) -asmlinkage long sys32_truncate64(const char __user *filename, - unsigned long offset_low, - unsigned long offset_high) +COMPAT_SYSCALL_DEFINE3(x86_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) { - return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); + return ksys_truncate(filename, + ((loff_t) offset_high << 32) | offset_low); } -asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low, - unsigned long offset_high) +COMPAT_SYSCALL_DEFINE3(x86_ftruncate64, unsigned int, fd, + unsigned long, offset_low, 
unsigned long, offset_high) { - return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); + return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); } /* @@ -96,8 +97,8 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) return 0; } -asmlinkage long sys32_stat64(const char __user *filename, - struct stat64 __user *statbuf) +COMPAT_SYSCALL_DEFINE2(x86_stat64, const char __user *, filename, + struct stat64 __user *, statbuf) { struct kstat stat; int ret = vfs_stat(filename, &stat); @@ -107,8 +108,8 @@ asmlinkage long sys32_stat64(const char __user *filename, return ret; } -asmlinkage long sys32_lstat64(const char __user *filename, - struct stat64 __user *statbuf) +COMPAT_SYSCALL_DEFINE2(x86_lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) { struct kstat stat; int ret = vfs_lstat(filename, &stat); @@ -117,7 +118,8 @@ asmlinkage long sys32_lstat64(const char __user *filename, return ret; } -asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) +COMPAT_SYSCALL_DEFINE2(x86_fstat64, unsigned int, fd, + struct stat64 __user *, statbuf) { struct kstat stat; int ret = vfs_fstat(fd, &stat); @@ -126,8 +128,9 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } -asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename, - struct stat64 __user *statbuf, int flag) +COMPAT_SYSCALL_DEFINE4(x86_fstatat, unsigned int, dfd, + const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) { struct kstat stat; int error; @@ -153,7 +156,7 @@ struct mmap_arg_struct32 { unsigned int offset; }; -asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) +COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg) { struct mmap_arg_struct32 a; @@ -163,29 +166,23 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) if (a.offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(a.addr, 
a.len, a.prot, a.flags, a.fd, + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); } -asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr, - int options) -{ - return compat_sys_wait4(pid, stat_addr, options, NULL); -} - /* warning: next two assume little endian */ -asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, - u32 poslo, u32 poshi) +COMPAT_SYSCALL_DEFINE5(x86_pread, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) { - return sys_pread64(fd, ubuf, count, - ((loff_t)AA(poshi) << 32) | AA(poslo)); + return ksys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, - u32 count, u32 poslo, u32 poshi) +COMPAT_SYSCALL_DEFINE5(x86_pwrite, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) { - return sys_pwrite64(fd, ubuf, count, - ((loff_t)AA(poshi) << 32) | AA(poslo)); + return ksys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); } @@ -193,40 +190,53 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, * Some system calls that need sign extended arguments. This could be * done by a generic wrapper. 
*/ -long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, - __u32 len_low, __u32 len_high, int advice) +COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) +{ + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) { - return sys_fadvise64_64(fd, - (((u64)offset_high)<<32) | offset_low, - (((u64)len_high)<<32) | len_low, - advice); + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); } -asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, - size_t count) +COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) { - return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); + return ksys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); } -asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, - unsigned n_low, unsigned n_hi, int flags) +COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) { - return sys_sync_file_range(fd, - ((u64)off_hi << 32) | off_low, - ((u64)n_hi << 32) | n_low, flags); + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); } -asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, - size_t len, int advice) +COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) { - return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, - len, advice); + return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); } 
-asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, - unsigned offset_hi, unsigned len_lo, - unsigned len_hi) +/* + * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS + */ +COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, + unsigned long, newsp, int __user *, parent_tidptr, + unsigned long, tls_val, int __user *, child_tidptr) { - return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, - ((u64)len_hi << 32) | len_lo); + return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, + tls_val); } diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 44f5d79d5105..a303d7b7d763 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -31,6 +31,7 @@ #include <asm/mmu.h> #include <asm/mpspec.h> #include <asm/realmode.h> +#include <asm/x86_init.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -94,7 +95,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) if (boot_cpu_data.x86 == 0x0F && boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86_model <= 0x05 && - boot_cpu_data.x86_mask < 0x0A) + boot_cpu_data.x86_stepping < 0x0A) return 1; else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E)) return 1; @@ -133,6 +134,14 @@ static inline bool acpi_has_cpu_in_madt(void) return !!acpi_lapic; } +#define ACPI_HAVE_ARCH_GET_ROOT_POINTER +static inline u64 acpi_arch_get_root_pointer(void) +{ + return x86_init.acpi.get_root_pointer(); +} + +void acpi_generic_reduced_hw_init(void); + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 @@ -142,6 +151,8 @@ static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } +static inline void acpi_generic_reduced_hw_init(void) { } + #endif /* !CONFIG_ACPI */ #define ARCH_HAS_POWER_INIT 1 diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index cf5961ca8677..4cd6a3b71824 100644 --- 
a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -218,13 +218,11 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) \ -{ \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ : output, ASM_CALL_CONSTRAINT \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input); \ -} + [new2] "i" (newfunc2), ## input) /* * use this macro(s) if you need more than one output parameter diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98722773391d..08acd954f00e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -138,7 +138,6 @@ extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void apic_intr_mode_init(void); -extern void setup_local_APIC(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); @@ -183,6 +182,7 @@ static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } +static inline void init_bsp_APIC(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } @@ -304,12 +304,6 @@ struct apic { u32 irq_delivery_mode; u32 irq_dest_mode; - /* Functions and data related to vector allocation */ - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, - const struct cpumask *mask); - int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, - struct irq_data *irqdata, - unsigned int *apicid); u32 (*calc_dest_apicid)(unsigned int cpu); /* ICR related functions */ @@ -319,7 +313,7 @@ struct apic { /* 
Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(int apicid); + int (*apic_id_valid)(u32 apicid); int (*apic_id_registered)(void); bool (*check_apicid_used)(physid_mask_t *map, int apicid); @@ -492,24 +486,14 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } -extern int default_apic_id_valid(int apicid); +extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, - struct irq_data *irqdata, - unsigned int *apicid); -extern int default_cpu_mask_to_apicid(const struct cpumask *cpumask, - struct irq_data *irqdata, - unsigned int *apicid); extern bool default_check_apicid_used(physid_mask_t *map, int apicid); -extern void flat_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask); -extern void default_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask); extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index 4d4015ddcf26..c356098b6fb9 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -7,6 +7,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_APM_H #define _ASM_X86_MACH_DEFAULT_APM_H +#include <asm/nospec-branch.h> + #ifdef APM_ZERO_SEGS # define APM_DO_ZERO_SEGS \ "pushl %%ds\n\t" \ @@ -32,6 +34,7 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. 
*/ + firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -44,6 +47,7 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, "=S" (*esi) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); + firmware_restrict_branch_speculation_end(); } static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, @@ -56,6 +60,7 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ + firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -68,6 +73,7 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, "=S" (si) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); + firmware_restrict_branch_speculation_end(); return error; } diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 4d111616524b..1908214b9125 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -38,7 +38,4 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) -asmlinkage void __fill_rsb(void); -asmlinkage void __clear_rsb(void); - #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 386a6900e206..219faaec51df 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,7 +136,6 @@ #endif #ifndef __ASSEMBLY__ -#ifndef __BPF__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. 
Otherwise the asm may be inserted before the frame pointer @@ -146,6 +145,5 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif -#endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 72759f131cc5..0db6bec95489 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -17,36 +17,40 @@ #define ATOMIC_INIT(i) { (i) } /** - * atomic_read - read atomic variable + * arch_atomic_read - read atomic variable * @v: pointer of type atomic_t * * Atomically reads the value of @v. */ -static __always_inline int atomic_read(const atomic_t *v) +static __always_inline int arch_atomic_read(const atomic_t *v) { + /* + * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here, + * it's non-inlined function that increases binary size and stack usage. + */ return READ_ONCE((v)->counter); } /** - * atomic_set - set atomic variable + * arch_atomic_set - set atomic variable * @v: pointer of type atomic_t * @i: required value * * Atomically sets the value of @v to @i. */ -static __always_inline void atomic_set(atomic_t *v, int i) +static __always_inline void arch_atomic_set(atomic_t *v, int i) { WRITE_ONCE(v->counter, i); } /** - * atomic_add - add integer to atomic variable + * arch_atomic_add - add integer to atomic variable * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v. */ -static __always_inline void atomic_add(int i, atomic_t *v) +static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -54,13 +58,13 @@ static __always_inline void atomic_add(int i, atomic_t *v) } /** - * atomic_sub - subtract integer from atomic variable + * arch_atomic_sub - subtract integer from atomic variable * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v. 
*/ -static __always_inline void atomic_sub(int i, atomic_t *v) +static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -68,7 +72,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v) } /** - * atomic_sub_and_test - subtract value from variable and test result + * arch_atomic_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer of type atomic_t * @@ -76,63 +80,63 @@ static __always_inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) +static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); } /** - * atomic_inc - increment atomic variable + * arch_atomic_inc - increment atomic variable * @v: pointer of type atomic_t * * Atomically increments @v by 1. */ -static __always_inline void atomic_inc(atomic_t *v) +static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); } /** - * atomic_dec - decrement atomic variable + * arch_atomic_dec - decrement atomic variable * @v: pointer of type atomic_t * * Atomically decrements @v by 1. */ -static __always_inline void atomic_dec(atomic_t *v) +static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); } /** - * atomic_dec_and_test - decrement and test + * arch_atomic_dec_and_test - decrement and test * @v: pointer of type atomic_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. 
*/ -static __always_inline bool atomic_dec_and_test(atomic_t *v) +static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); } /** - * atomic_inc_and_test - increment and test + * arch_atomic_inc_and_test - increment and test * @v: pointer of type atomic_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ -static __always_inline bool atomic_inc_and_test(atomic_t *v) +static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); } /** - * atomic_add_negative - add and test if negative + * arch_atomic_add_negative - add and test if negative * @i: integer value to add * @v: pointer of type atomic_t * @@ -140,65 +144,65 @@ static __always_inline bool atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static __always_inline bool atomic_add_negative(int i, atomic_t *v) +static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); } /** - * atomic_add_return - add integer and return + * arch_atomic_add_return - add integer and return * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } /** - * atomic_sub_return - subtract integer and return + * arch_atomic_sub_return - subtract integer and return * @v: pointer of type atomic_t * @i: integer value to subtract * * Atomically subtracts @i from @v and returns @v - @i */ -static __always_inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { - return atomic_add_return(-i, v); + return 
arch_atomic_add_return(-i, v); } -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) +#define arch_atomic_inc_return(v) (arch_atomic_add_return(1, v)) +#define arch_atomic_dec_return(v) (arch_atomic_sub_return(1, v)) -static __always_inline int atomic_fetch_add(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } -static __always_inline int atomic_fetch_sub(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) { return xadd(&v->counter, -i); } -static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { - return cmpxchg(&v->counter, old, new); + return arch_cmpxchg(&v->counter, old, new); } -#define atomic_try_cmpxchg atomic_try_cmpxchg -static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) +#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg +static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { return try_cmpxchg(&v->counter, old, new); } -static inline int atomic_xchg(atomic_t *v, int new) +static inline int arch_atomic_xchg(atomic_t *v, int new) { return xchg(&v->counter, new); } -static inline void atomic_and(int i, atomic_t *v) +static inline void arch_atomic_and(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "andl %1,%0" : "+m" (v->counter) @@ -206,16 +210,16 @@ static inline void atomic_and(int i, atomic_t *v) : "memory"); } -static inline int atomic_fetch_and(int i, atomic_t *v) +static inline int arch_atomic_fetch_and(int i, atomic_t *v) { - int val = atomic_read(v); + int val = arch_atomic_read(v); - do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i)); return val; } -static inline void atomic_or(int i, atomic_t *v) +static inline void arch_atomic_or(int i, atomic_t *v) { asm 
volatile(LOCK_PREFIX "orl %1,%0" : "+m" (v->counter) @@ -223,16 +227,16 @@ static inline void atomic_or(int i, atomic_t *v) : "memory"); } -static inline int atomic_fetch_or(int i, atomic_t *v) +static inline int arch_atomic_fetch_or(int i, atomic_t *v) { - int val = atomic_read(v); + int val = arch_atomic_read(v); - do { } while (!atomic_try_cmpxchg(v, &val, val | i)); + do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i)); return val; } -static inline void atomic_xor(int i, atomic_t *v) +static inline void arch_atomic_xor(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "xorl %1,%0" : "+m" (v->counter) @@ -240,17 +244,17 @@ static inline void atomic_xor(int i, atomic_t *v) : "memory"); } -static inline int atomic_fetch_xor(int i, atomic_t *v) +static inline int arch_atomic_fetch_xor(int i, atomic_t *v) { - int val = atomic_read(v); + int val = arch_atomic_read(v); - do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i)); return val; } /** - * __atomic_add_unless - add unless the number is already a given value + * __arch_atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -258,14 +262,14 @@ static inline int atomic_fetch_xor(int i, atomic_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. 
*/ -static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u) { - int c = atomic_read(v); + int c = arch_atomic_read(v); do { if (unlikely(c == u)) break; - } while (!atomic_try_cmpxchg(v, &c, c + a)); + } while (!arch_atomic_try_cmpxchg(v, &c, c + a)); return c; } @@ -276,4 +280,6 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) # include <asm/atomic64_64.h> #endif +#include <asm-generic/atomic-instrumented.h> + #endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 97c46b8169b7..92212bf0484f 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -62,7 +62,7 @@ ATOMIC64_DECL(add_unless); #undef ATOMIC64_EXPORT /** - * atomic64_cmpxchg - cmpxchg atomic64 variable + * arch_atomic64_cmpxchg - cmpxchg atomic64 variable * @v: pointer to type atomic64_t * @o: expected value * @n: new value @@ -71,20 +71,21 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n) +static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, + long long n) { - return cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, o, n); } /** - * atomic64_xchg - xchg atomic64 variable + * arch_atomic64_xchg - xchg atomic64 variable * @v: pointer to type atomic64_t * @n: value to assign * * Atomically xchgs the value of @v to @n and returns * the old value. 
*/ -static inline long long atomic64_xchg(atomic64_t *v, long long n) +static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) { long long o; unsigned high = (unsigned)(n >> 32); @@ -96,13 +97,13 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n) } /** - * atomic64_set - set atomic64 variable + * arch_atomic64_set - set atomic64 variable * @v: pointer to type atomic64_t * @i: value to assign * * Atomically sets the value of @v to @n. */ -static inline void atomic64_set(atomic64_t *v, long long i) +static inline void arch_atomic64_set(atomic64_t *v, long long i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -112,26 +113,26 @@ static inline void atomic64_set(atomic64_t *v, long long i) } /** - * atomic64_read - read atomic64 variable + * arch_atomic64_read - read atomic64 variable * @v: pointer to type atomic64_t * * Atomically reads the value of @v and returns it. */ -static inline long long atomic64_read(const atomic64_t *v) +static inline long long arch_atomic64_read(const atomic64_t *v) { long long r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; - } +} /** - * atomic64_add_return - add and return + * arch_atomic64_add_return - add and return * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v and returns @i + *@v */ -static inline long long atomic64_add_return(long long i, atomic64_t *v) +static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -142,7 +143,7 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) /* * Other variants with different arithmetic operators: */ -static inline long long atomic64_sub_return(long long i, atomic64_t *v) +static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -150,7 +151,7 @@ static inline long long 
atomic64_sub_return(long long i, atomic64_t *v) return i; } -static inline long long atomic64_inc_return(atomic64_t *v) +static inline long long arch_atomic64_inc_return(atomic64_t *v) { long long a; alternative_atomic64(inc_return, "=&A" (a), @@ -158,7 +159,7 @@ static inline long long atomic64_inc_return(atomic64_t *v) return a; } -static inline long long atomic64_dec_return(atomic64_t *v) +static inline long long arch_atomic64_dec_return(atomic64_t *v) { long long a; alternative_atomic64(dec_return, "=&A" (a), @@ -167,13 +168,13 @@ static inline long long atomic64_dec_return(atomic64_t *v) } /** - * atomic64_add - add integer to atomic64 variable + * arch_atomic64_add - add integer to atomic64 variable * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v. */ -static inline long long atomic64_add(long long i, atomic64_t *v) +static inline long long arch_atomic64_add(long long i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -182,13 +183,13 @@ static inline long long atomic64_add(long long i, atomic64_t *v) } /** - * atomic64_sub - subtract the atomic64 variable + * arch_atomic64_sub - subtract the atomic64 variable * @i: integer value to subtract * @v: pointer to type atomic64_t * * Atomically subtracts @i from @v. */ -static inline long long atomic64_sub(long long i, atomic64_t *v) +static inline long long arch_atomic64_sub(long long i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -197,7 +198,7 @@ static inline long long atomic64_sub(long long i, atomic64_t *v) } /** - * atomic64_sub_and_test - subtract value from variable and test result + * arch_atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer to type atomic64_t * @@ -205,46 +206,46 @@ static inline long long atomic64_sub(long long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. 
*/ -static inline int atomic64_sub_and_test(long long i, atomic64_t *v) +static inline int arch_atomic64_sub_and_test(long long i, atomic64_t *v) { - return atomic64_sub_return(i, v) == 0; + return arch_atomic64_sub_return(i, v) == 0; } /** - * atomic64_inc - increment atomic64 variable + * arch_atomic64_inc - increment atomic64 variable * @v: pointer to type atomic64_t * * Atomically increments @v by 1. */ -static inline void atomic64_inc(atomic64_t *v) +static inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } /** - * atomic64_dec - decrement atomic64 variable + * arch_atomic64_dec - decrement atomic64 variable * @v: pointer to type atomic64_t * * Atomically decrements @v by 1. */ -static inline void atomic64_dec(atomic64_t *v) +static inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } /** - * atomic64_dec_and_test - decrement and test + * arch_atomic64_dec_and_test - decrement and test * @v: pointer to type atomic64_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic64_dec_and_test(atomic64_t *v) +static inline int arch_atomic64_dec_and_test(atomic64_t *v) { - return atomic64_dec_return(v) == 0; + return arch_atomic64_dec_return(v) == 0; } /** @@ -255,13 +256,13 @@ static inline int atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. 
*/ -static inline int atomic64_inc_and_test(atomic64_t *v) +static inline int arch_atomic64_inc_and_test(atomic64_t *v) { - return atomic64_inc_return(v) == 0; + return arch_atomic64_inc_return(v) == 0; } /** - * atomic64_add_negative - add and test if negative + * arch_atomic64_add_negative - add and test if negative * @i: integer value to add * @v: pointer to type atomic64_t * @@ -269,13 +270,13 @@ static inline int atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic64_add_negative(long long i, atomic64_t *v) +static inline int arch_atomic64_add_negative(long long i, atomic64_t *v) { - return atomic64_add_return(i, v) < 0; + return arch_atomic64_add_return(i, v) < 0; } /** - * atomic64_add_unless - add unless the number is a given value + * arch_atomic64_add_unless - add unless the number is a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -283,7 +284,8 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. 
*/ -static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) +static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, + long long u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -294,7 +296,7 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) } -static inline int atomic64_inc_not_zero(atomic64_t *v) +static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; alternative_atomic64(inc_not_zero, "=&a" (r), @@ -302,7 +304,7 @@ static inline int atomic64_inc_not_zero(atomic64_t *v) return r; } -static inline long long atomic64_dec_if_positive(atomic64_t *v) +static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) { long long r; alternative_atomic64(dec_if_positive, "=&A" (r), @@ -313,70 +315,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void atomic64_and(long long i, atomic64_t *v) +static inline void arch_atomic64_and(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; } -static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; return old; } -static inline void atomic64_or(long long i, atomic64_t *v) +static inline void arch_atomic64_or(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; } -static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, 
c, c | i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; return old; } -static inline void atomic64_xor(long long i, atomic64_t *v) +static inline void arch_atomic64_xor(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; } -static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; return old; } -static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +static inline long long arch_atomic64_fetch_add(long long i, atomic64_t *v) { long long old, c = 0; - while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) c = old; return old; } -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 738495caf05f..6106b59d3260 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -11,37 +11,37 @@ #define ATOMIC64_INIT(i) { (i) } /** - * atomic64_read - read atomic64 variable + * arch_atomic64_read - read atomic64 variable * @v: pointer of type atomic64_t * * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline long atomic64_read(const atomic64_t *v) +static inline long arch_atomic64_read(const atomic64_t *v) { return READ_ONCE((v)->counter); } /** - * atomic64_set - set atomic64 variable + * arch_atomic64_set - set atomic64 variable * @v: pointer to type atomic64_t * @i: required value * * Atomically sets the value of @v to @i. 
*/ -static inline void atomic64_set(atomic64_t *v, long i) +static inline void arch_atomic64_set(atomic64_t *v, long i) { WRITE_ONCE(v->counter, i); } /** - * atomic64_add - add integer to atomic64 variable + * arch_atomic64_add - add integer to atomic64 variable * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v. */ -static __always_inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void arch_atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -49,13 +49,13 @@ static __always_inline void atomic64_add(long i, atomic64_t *v) } /** - * atomic64_sub - subtract the atomic64 variable + * arch_atomic64_sub - subtract the atomic64 variable * @i: integer value to subtract * @v: pointer to type atomic64_t * * Atomically subtracts @i from @v. */ -static inline void atomic64_sub(long i, atomic64_t *v) +static inline void arch_atomic64_sub(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) @@ -63,7 +63,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) } /** - * atomic64_sub_and_test - subtract value from variable and test result + * arch_atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer to type atomic64_t * @@ -71,18 +71,18 @@ static inline void atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline bool atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); } /** - * atomic64_inc - increment atomic64 variable + * arch_atomic64_inc - increment atomic64 variable * @v: pointer to type atomic64_t * * Atomically increments @v by 1. 
*/ -static __always_inline void atomic64_inc(atomic64_t *v) +static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -90,12 +90,12 @@ static __always_inline void atomic64_inc(atomic64_t *v) } /** - * atomic64_dec - decrement atomic64 variable + * arch_atomic64_dec - decrement atomic64 variable * @v: pointer to type atomic64_t * * Atomically decrements @v by 1. */ -static __always_inline void atomic64_dec(atomic64_t *v) +static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -103,33 +103,33 @@ static __always_inline void atomic64_dec(atomic64_t *v) } /** - * atomic64_dec_and_test - decrement and test + * arch_atomic64_dec_and_test - decrement and test * @v: pointer to type atomic64_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ -static inline bool atomic64_dec_and_test(atomic64_t *v) +static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); } /** - * atomic64_inc_and_test - increment and test + * arch_atomic64_inc_and_test - increment and test * @v: pointer to type atomic64_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ -static inline bool atomic64_inc_and_test(atomic64_t *v) +static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); } /** - * atomic64_add_negative - add and test if negative + * arch_atomic64_add_negative - add and test if negative * @i: integer value to add * @v: pointer to type atomic64_t * @@ -137,59 +137,59 @@ static inline bool atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. 
*/ -static inline bool atomic64_add_negative(long i, atomic64_t *v) +static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); } /** - * atomic64_add_return - add and return + * arch_atomic64_add_return - add and return * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } -static inline long atomic64_sub_return(long i, atomic64_t *v) +static inline long arch_atomic64_sub_return(long i, atomic64_t *v) { - return atomic64_add_return(-i, v); + return arch_atomic64_add_return(-i, v); } -static inline long atomic64_fetch_add(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_add(long i, atomic64_t *v) { return xadd(&v->counter, i); } -static inline long atomic64_fetch_sub(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) { return xadd(&v->counter, -i); } -#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) -#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) +#define arch_atomic64_inc_return(v) (arch_atomic64_add_return(1, (v))) +#define arch_atomic64_dec_return(v) (arch_atomic64_sub_return(1, (v))) -static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) { - return cmpxchg(&v->counter, old, new); + return arch_cmpxchg(&v->counter, old, new); } -#define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } -static inline long 
atomic64_xchg(atomic64_t *v, long new) +static inline long arch_atomic64_xchg(atomic64_t *v, long new) { return xchg(&v->counter, new); } /** - * atomic64_add_unless - add unless the number is a given value + * arch_atomic64_add_unless - add unless the number is a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -197,37 +197,37 @@ static inline long atomic64_xchg(atomic64_t *v, long new) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) +static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u) { - s64 c = atomic64_read(v); + s64 c = arch_atomic64_read(v); do { if (unlikely(c == u)) return false; - } while (!atomic64_try_cmpxchg(v, &c, c + a)); + } while (!arch_atomic64_try_cmpxchg(v, &c, c + a)); return true; } -#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) +#define arch_atomic64_inc_not_zero(v) arch_atomic64_add_unless((v), 1, 0) /* - * atomic64_dec_if_positive - decrement by 1 if old value positive + * arch_atomic64_dec_if_positive - decrement by 1 if old value positive * @v: pointer of type atomic_t * * The function returns the old value of *v minus 1, even if * the atomic variable, v, was not decremented. 
*/ -static inline long atomic64_dec_if_positive(atomic64_t *v) +static inline long arch_atomic64_dec_if_positive(atomic64_t *v) { - s64 dec, c = atomic64_read(v); + s64 dec, c = arch_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) break; - } while (!atomic64_try_cmpxchg(v, &c, dec)); + } while (!arch_atomic64_try_cmpxchg(v, &c, dec)); return dec; } -static inline void atomic64_and(long i, atomic64_t *v) +static inline void arch_atomic64_and(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "andq %1,%0" : "+m" (v->counter) @@ -235,16 +235,16 @@ static inline void atomic64_and(long i, atomic64_t *v) : "memory"); } -static inline long atomic64_fetch_and(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) { - s64 val = atomic64_read(v); + s64 val = arch_atomic64_read(v); do { - } while (!atomic64_try_cmpxchg(v, &val, val & i)); + } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); return val; } -static inline void atomic64_or(long i, atomic64_t *v) +static inline void arch_atomic64_or(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "orq %1,%0" : "+m" (v->counter) @@ -252,16 +252,16 @@ static inline void atomic64_or(long i, atomic64_t *v) : "memory"); } -static inline long atomic64_fetch_or(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) { - s64 val = atomic64_read(v); + s64 val = arch_atomic64_read(v); do { - } while (!atomic64_try_cmpxchg(v, &val, val | i)); + } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); return val; } -static inline void atomic64_xor(long i, atomic64_t *v) +static inline void arch_atomic64_xor(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "xorq %1,%0" : "+m" (v->counter) @@ -269,12 +269,12 @@ static inline void atomic64_xor(long i, atomic64_t *v) : "memory"); } -static inline long atomic64_fetch_xor(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_xor(long i, atomic64_t *v) { - s64 val = atomic64_read(v); + s64 val = 
arch_atomic64_read(v); do { - } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); return val; } diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 30d406146016..042b5e892ed1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -40,7 +40,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, asm ("cmp %1,%2; sbb %0,%0;" :"=r" (mask) - :"r"(size),"r" (index) + :"g"(size),"r" (index) :"cc"); return mask; } @@ -52,11 +52,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ "lfence", X86_FEATURE_LFENCE_RDTSC) -#ifdef CONFIG_X86_PPRO_FENCE -#define dma_rmb() rmb() -#else #define dma_rmb() barrier() -#endif #define dma_wmb() barrier() #ifdef CONFIG_X86_32 @@ -68,30 +64,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) -#if defined(CONFIG_X86_PPRO_FENCE) - -/* - * For this option x86 doesn't have a strong TSO memory - * model and we should fall back to full barriers. 
- */ - -#define __smp_store_release(p, v) \ -do { \ - compiletime_assert_atomic_type(*p); \ - __smp_mb(); \ - WRITE_ONCE(*p, v); \ -} while (0) - -#define __smp_load_acquire(p) \ -({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ - __smp_mb(); \ - ___p1; \ -}) - -#else /* regular x86 TSO memory ordering */ - #define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ @@ -107,8 +79,6 @@ do { \ ___p1; \ }) -#endif - /* Atomic operations are already serializing on x86 */ #define __smp_mb__before_atomic() barrier() #define __smp_mb__after_atomic() barrier() diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 3fa039855b8f..9f645ba57dbb 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -78,7 +78,7 @@ set_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)CONST_MASK(nr)) : "memory"); } else { - asm volatile(LOCK_PREFIX "bts %1,%0" + asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); } } @@ -94,7 +94,7 @@ set_bit(long nr, volatile unsigned long *addr) */ static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { - asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); + asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory"); } /** @@ -115,7 +115,7 @@ clear_bit(long nr, volatile unsigned long *addr) : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)~CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX "btr %1,%0" + asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : BITOP_ADDR(addr) : "Ir" (nr)); } @@ -137,7 +137,7 @@ static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *ad static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr)); } static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) @@ 
-182,7 +182,7 @@ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long * */ static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { - asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr)); } /** @@ -201,7 +201,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX "btc %1,%0" + asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : BITOP_ADDR(addr) : "Ir" (nr)); } @@ -217,7 +217,8 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c); + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), + *addr, "Ir", nr, "%0", c); } /** @@ -246,7 +247,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * { bool oldbit; - asm("bts %2,%1" + asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -263,7 +264,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c); + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), + *addr, "Ir", nr, "%0", c); } /** @@ -286,7 +288,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long { bool oldbit; - asm volatile("btr %2,%1" + asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -298,7 +300,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon { bool oldbit; - asm volatile("btc %2,%1" + asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -316,7 +318,8 @@ static 
__always_inline bool __test_and_change_bit(long nr, volatile unsigned lon */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c); + GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), + *addr, "Ir", nr, "%0", c); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) @@ -329,7 +332,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l { bool oldbit; - asm volatile("bt %2,%1" + asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 34d99af43994..6804d6642767 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -5,23 +5,20 @@ #include <linux/stringify.h> /* - * Since some emulators terminate on UD2, we cannot use it for WARN. - * Since various instruction decoders disagree on the length of UD1, - * we cannot use it either. So use UD0 for WARN. + * Despite that some emulators terminate on UD2, we use it for WARN(). * - * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas - * our kernel decoder thinks it takes a ModRM byte, which seems consistent - * with various things like the Intel SDM instruction encoding rules) + * Since various instruction decoders/specs disagree on the encoding of + * UD0/UD1. 
*/ -#define ASM_UD0 ".byte 0x0f, 0xff" +#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */ #define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */ #define ASM_UD2 ".byte 0x0f, 0x0b" #define INSN_UD0 0xff0f #define INSN_UD2 0x0b0f -#define LEN_UD0 2 +#define LEN_UD2 2 #ifdef CONFIG_GENERIC_BUG @@ -77,7 +74,11 @@ do { \ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(flags) \ +do { \ + _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \ + annotate_reachable(); \ +} while (0) #include <asm-generic/bug.h> diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 56bd436ed01b..e3efd8a06066 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -145,13 +145,13 @@ extern void __add_wrong_size(void) # include <asm/cmpxchg_64.h> #endif -#define cmpxchg(ptr, old, new) \ +#define arch_cmpxchg(ptr, old, new) \ __cmpxchg(ptr, old, new, sizeof(*(ptr))) -#define sync_cmpxchg(ptr, old, new) \ +#define arch_sync_cmpxchg(ptr, old, new) \ __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) -#define cmpxchg_local(ptr, old, new) \ +#define arch_cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) @@ -221,7 +221,7 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) -#define try_cmpxchg(ptr, pold, new) \ +#define try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) /* @@ -250,10 +250,10 @@ extern void __add_wrong_size(void) __ret; \ }) -#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \ __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) -#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ +#define arch_cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ __cmpxchg_double(, p1, p2, o1, o2, n1, n2) #endif /* ASM_X86_CMPXCHG_H */ diff --git 
a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 1732704f0445..1a2eafca7038 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -36,10 +36,10 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) } #ifdef CONFIG_X86_CMPXCHG64 -#define cmpxchg64(ptr, o, n) \ +#define arch_cmpxchg64(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) -#define cmpxchg64_local(ptr, o, n) \ +#define arch_cmpxchg64_local(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) #endif @@ -76,7 +76,7 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) * to simulate the cmpxchg8b on the 80386 and 80486 CPU. */ -#define cmpxchg64(ptr, o, n) \ +#define arch_cmpxchg64(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ __typeof__(*(ptr)) __old = (o); \ @@ -93,7 +93,7 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) __ret; }) -#define cmpxchg64_local(ptr, o, n) \ +#define arch_cmpxchg64_local(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ __typeof__(*(ptr)) __old = (o); \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 03cad196a301..bfca3b346c74 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -7,13 +7,13 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) *ptr = val; } -#define cmpxchg64(ptr, o, n) \ +#define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ cmpxchg((ptr), (o), (n)); \ }) -#define cmpxchg64_local(ptr, o, n) \ +#define arch_cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ cmpxchg_local((ptr), (o), (n)); \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 70eddb3922ff..b27da9602a6d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -140,7 +140,6 @@ 
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These will statically patch the target code for additional @@ -148,45 +147,46 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" - "2:\n" - ".skip -(((5f-4f) - (2b-1b)) > 0) * " - "((5f-4f) - (2b-1b)),0x90\n" - "3:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 4f - .\n" /* repl offset */ - " .word %P1\n" /* always replace */ - " .byte 3b - 1b\n" /* src len */ - " .byte 5f - 4f\n" /* repl len */ - " .byte 3b - 2b\n" /* pad len */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "4: jmp %l[t_no]\n" - "5:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 0\n" /* no replacement */ - " .word %P0\n" /* feature bit */ - " .byte 3b - 1b\n" /* src len */ - " .byte 0\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .altinstr_aux,\"ax\"\n" - "6:\n" - " testb %[bitnum],%[cap_byte]\n" - " jnz %l[t_yes]\n" - " jmp %l[t_no]\n" - ".previous\n" - : : "i" (bit), "i" (X86_FEATURE_ALWAYS), - [bitnum] "i" (1 << (bit & 7)), - [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) - : : t_yes, t_no); - t_yes: - return true; - t_no: - return false; + asm_volatile_goto("1: jmp 6f\n" + "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 4f - .\n" /* repl offset */ + " .word %P[always]\n" /* always replace */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ + ".previous\n" + ".section 
.altinstr_replacement,\"ax\"\n" + "4: jmp %l[t_no]\n" + "5:\n" + ".previous\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 0\n" /* no replacement */ + " .word %P[feature]\n" /* feature bit */ + " .byte 3b - 1b\n" /* src len */ + " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ + ".previous\n" + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : [feature] "i" (bit), + [always] "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); +t_yes: + return true; +t_no: + return false; } #define static_cpu_has(bit) \ @@ -195,13 +195,6 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) -#else -/* - * Fall back to dynamic for gcc versions which don't support asm goto. Should be - * a minority now anyway. - */ -#define static_cpu_has(bit) boot_cpu_has(bit) -#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0dfe4d3f74e2..578793e97431 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -213,6 +213,7 @@ #define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -315,9 +316,11 @@ #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ #define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ #define X86_FEATURE_AVX512_BITALG (16*32+12) /* 
Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ @@ -327,6 +330,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index 10f8d590bcfe..a5d86fc0593f 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -2,8 +2,9 @@ #ifndef ASM_X86_CAMELLIA_H #define ASM_X86_CAMELLIA_H -#include <linux/kernel.h> +#include <crypto/b128ops.h> #include <linux/crypto.h> +#include <linux/kernel.h> #define CAMELLIA_MIN_KEY_SIZE 16 #define CAMELLIA_MAX_KEY_SIZE 32 @@ -11,16 +12,13 @@ #define CAMELLIA_TABLE_BYTE_LEN 272 #define CAMELLIA_PARALLEL_BLOCKS 2 +struct crypto_skcipher; + struct camellia_ctx { u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; u32 key_length; }; -struct camellia_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct camellia_ctx camellia_ctx; -}; - struct camellia_xts_ctx { struct camellia_ctx 
tweak_ctx; struct camellia_ctx crypt_ctx; @@ -30,11 +28,7 @@ extern int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, unsigned int key_len, u32 *flags); -extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); -extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm); - -extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, +extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); /* regular block cipher functions */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 553a03de55c3..d1818634ae7e 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -45,7 +45,7 @@ struct common_glue_ctx { }; static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, - struct blkcipher_desc *desc, + struct skcipher_walk *walk, bool fpu_enabled, unsigned int nbytes) { if (likely(fpu_blocks_limit < 0)) @@ -61,33 +61,6 @@ static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, if (nbytes < bsize * (unsigned int)fpu_blocks_limit) return false; - if (desc) { - /* prevent sleeping if FPU is in use */ - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - } - - kernel_fpu_begin(); - return true; -} - -static inline bool glue_skwalk_fpu_begin(unsigned int bsize, - int fpu_blocks_limit, - struct skcipher_walk *walk, - bool fpu_enabled, unsigned int nbytes) -{ - if (likely(fpu_blocks_limit < 0)) - return false; - - if (fpu_enabled) - return true; - - /* - * Vector-registers are only used when chunk to be processed is large - * enough, so do not enable FPU until it is necessary. 
- */ - if (nbytes < bsize * (unsigned int)fpu_blocks_limit) - return false; - /* prevent sleeping if FPU is in use */ skcipher_walk_atomise(walk); @@ -126,41 +99,17 @@ static inline void le128_inc(le128 *i) i->b = cpu_to_le64(b); } -extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); - -extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes); - -extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes); - -extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); - -extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx); - -extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, - struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes, - common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx); +extern int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req); + +extern int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn, + struct skcipher_request *req); + +extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req); + +extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + struct skcipher_request *req); extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req, diff --git 
a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index c958b7bd0fcb..db7c9cc32234 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -2,15 +2,13 @@ #ifndef ASM_X86_SERPENT_AVX_H #define ASM_X86_SERPENT_AVX_H -#include <linux/crypto.h> +#include <crypto/b128ops.h> #include <crypto/serpent.h> +#include <linux/types.h> -#define SERPENT_PARALLEL_BLOCKS 8 +struct crypto_skcipher; -struct serpent_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct serpent_ctx serpent_ctx; -}; +#define SERPENT_PARALLEL_BLOCKS 8 struct serpent_xts_ctx { struct serpent_ctx tweak_ctx; @@ -38,12 +36,7 @@ extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); - -extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm); - -extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, +extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); #endif diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 65bb80adba3e..f618bf272b90 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -4,19 +4,8 @@ #include <linux/crypto.h> #include <crypto/twofish.h> -#include <crypto/lrw.h> #include <crypto/b128ops.h> -struct twofish_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct twofish_ctx twofish_ctx; -}; - -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - /* regular block cipher functions from twofish_x86_64 module */ asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, const u8 *src); @@ -36,12 +25,4 @@ extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, 
const u128 *src, extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, le128 *iv); -extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); - -extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); - -extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); - #endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 5e12c63b47aa..a8f6c809d9b1 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -6,6 +6,9 @@ struct dev_archdata { #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif +#ifdef CONFIG_STA2X11 + bool is_sta2x11; +#endif }; #if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) diff --git a/arch/x86/include/asm/dma-direct.h b/arch/x86/include/asm/dma-direct.h index 1295bc622ebe..1a19251eaac9 100644 --- a/arch/x86/include/asm/dma-direct.h +++ b/arch/x86/include/asm/dma-direct.h @@ -2,29 +2,8 @@ #ifndef ASM_X86_DMA_DIRECT_H #define ASM_X86_DMA_DIRECT_H 1 -#include <linux/mem_encrypt.h> - -#ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size); -dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); -#else -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) -{ - if (!dev->dma_mask) - return 0; - - return addr + size - 1 <= *dev->dma_mask; -} - -static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) -{ - return __sme_set(paddr); -} +dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr); +phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr); -static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) -{ - return __sme_clr(daddr); -} -#endif /* 
CONFIG_X86_DMA_REMAP */ #endif /* ASM_X86_DMA_DIRECT_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 6277c83c0eb1..89ce4bfd241f 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -36,37 +36,4 @@ int arch_dma_supported(struct device *dev, u64 mask); bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define arch_dma_alloc_attrs arch_dma_alloc_attrs -extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs); - -extern void dma_generic_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs); - -static inline unsigned long dma_alloc_coherent_mask(struct device *dev, - gfp_t gfp) -{ - unsigned long dma_mask = 0; - - dma_mask = dev->coherent_dma_mask; - if (!dma_mask) - dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); - - return dma_mask; -} - -static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) -{ - unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp); - - if (dma_mask <= DMA_BIT_MASK(24)) - gfp |= GFP_DMA; -#ifdef CONFIG_X86_64 - if (dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; -#endif - return gfp; -} - #endif diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 85f6ccb80b91..cec5fae23eb3 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -6,6 +6,8 @@ #include <asm/pgtable.h> #include <asm/processor-flags.h> #include <asm/tlb.h> +#include <asm/nospec-branch.h> +#include <asm/mmu_context.h> /* * We map the EFI regions needed for runtime services non-contiguously, @@ -36,8 +38,18 @@ extern asmlinkage unsigned long efi_call_phys(void *, ...); -#define arch_efi_call_virt_setup() kernel_fpu_begin() -#define arch_efi_call_virt_teardown() kernel_fpu_end() +#define arch_efi_call_virt_setup() \ +({ \ + kernel_fpu_begin(); \ + 
firmware_restrict_branch_speculation_start(); \ +}) + +#define arch_efi_call_virt_teardown() \ +({ \ + firmware_restrict_branch_speculation_end(); \ + kernel_fpu_end(); \ +}) + /* * Wrap all the virtual calls in a way that forces the parameters on the stack. @@ -58,14 +70,13 @@ extern asmlinkage u64 efi_call(void *fp, ...); #define efi_call_phys(f, args...) efi_call((f), args) /* - * Scratch space used for switching the pagetable in the EFI stub + * struct efi_scratch - Scratch space used while switching to/from efi_mm + * @phys_stack: stack used during EFI Mixed Mode + * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm */ struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - bool use_pgd; - u64 phys_stack; + u64 phys_stack; + struct mm_struct *prev_mm; } __packed; #define arch_efi_call_virt_setup() \ @@ -73,12 +84,10 @@ struct efi_scratch { efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ + firmware_restrict_branch_speculation_start(); \ \ - if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = __read_cr3(); \ - write_cr3((unsigned long)efi_scratch.efi_pgt); \ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(&efi_mm); \ }) #define arch_efi_call_virt(p, f, args...) 
\ @@ -86,11 +95,10 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - if (efi_scratch.use_pgd) { \ - write_cr3(efi_scratch.prev_cr3); \ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(efi_scratch.prev_mm); \ \ + firmware_restrict_branch_speculation_end(); \ __kernel_fpu_end(); \ preempt_enable(); \ }) @@ -131,6 +139,7 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); +extern void efi_switch_mm(struct mm_struct *mm); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 09ad88572746..cc8f8fcf9b4a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -46,7 +46,21 @@ int ftrace_int3_handler(struct pt_regs *regs); #endif /* CONFIG_FUNCTION_TRACER */ -#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS) +#ifndef __ASSEMBLY__ + +#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Compare the symbol name with the system call name. Skip the + * "__x64_sys", "__ia32_sys" or simple "sys" prefix. 
+ */ + return !strcmp(sym + 3, name + 3) || + (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) || + (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)); +} + +#ifndef COMPILE_OFFSETS #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) #include <asm/compat.h> @@ -67,6 +81,7 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) return false; } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ -#endif /* !__ASSEMBLY__ && !COMPILE_OFFSETS */ +#endif /* !COMPILE_OFFSETS */ +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7c341a74ec8c..5ea2afd4c871 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -40,6 +40,7 @@ typedef struct { #endif #if IS_ENABLED(CONFIG_HYPERV) unsigned int irq_hv_reenlightenment_count; + unsigned int hyperv_stimer0_count; #endif } ____cacheline_aligned irq_cpustat_t; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 2851077b6051..32e666e1231e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void kvm_posted_intr_nested_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); +extern asmlinkage void uv_bau_message_intr1(void); extern asmlinkage void spurious_interrupt(void); extern asmlinkage void thermal_interrupt(void); diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/asm/hyperv-tlfs.h index 197c2e6c7376..416cb0e0c496 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -1,6 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_X86_HYPERV_H -#define _ASM_X86_HYPERV_H + +/* + * This file contains definitions from Hyper-V Hypervisor Top-Level Functional + * Specification 
(TLFS): + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs + */ + +#ifndef _ASM_X86_HYPERV_TLFS_H +#define _ASM_X86_HYPERV_TLFS_H #include <linux/types.h> @@ -14,6 +21,7 @@ #define HYPERV_CPUID_FEATURES 0x40000003 #define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 @@ -77,6 +85,9 @@ /* Crash MSR available */ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) +/* stimer Direct Mode is available */ +#define HV_X64_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) + /* * Feature identification: EBX indicates which flags were specified at * partition creation. The format is the same as the partition creation @@ -156,6 +167,9 @@ /* Recommend using the newer ExProcessorMasks interface */ #define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) +/* Recommend using enlightened VMCS */ +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) + /* * Crash notification flag. */ @@ -189,7 +203,7 @@ #define HV_X64_MSR_EOI 0x40000070 #define HV_X64_MSR_ICR 0x40000071 #define HV_X64_MSR_TPR 0x40000072 -#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 /* Define synthetic interrupt controller model specific registers. */ #define HV_X64_MSR_SCONTROL 0x40000080 @@ -237,28 +251,77 @@ #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) +/* + * Declare the MSR used to setup pages used to communicate with the hypervisor. + */ +union hv_x64_msr_hypercall_contents { + u64 as_uint64; + struct { + u64 enable:1; + u64 reserved:11; + u64 guest_physical_address:52; + }; +}; + +/* + * TSC page layout. 
+ */ +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; + u64 reserved2[509]; +}; + +/* + * The guest OS needs to register the guest ID with the hypervisor. + * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V specification: + * + * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. + * + * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0 - Distro specific identification + * + * + */ + +#define HV_LINUX_VENDOR_ID 0x8100 + /* TSC emulation after migration */ #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 struct hv_reenlightenment_control { - u64 vector:8; - u64 reserved1:8; - u64 enabled:1; - u64 reserved2:15; - u64 target_vp:32; + __u64 vector:8; + __u64 reserved1:8; + __u64 enabled:1; + __u64 reserved2:15; + __u64 target_vp:32; }; #define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 #define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 struct hv_tsc_emulation_control { - u64 enabled:1; - u64 reserved:63; + __u64 enabled:1; + __u64 reserved:63; }; struct hv_tsc_emulation_status { - u64 inprogress:1; - u64 reserved:63; + __u64 inprogress:1; + __u64 reserved:63; }; #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 @@ -275,10 +338,13 @@ struct hv_tsc_emulation_status { #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d -#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 -#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 -#define 
HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ - (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +/* Hyper-V Enlightened VMCS version mask in nested features CPUID */ +#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 @@ -298,12 +364,22 @@ enum HV_GENERIC_SET_FORMAT { HV_GENERIC_SET_ALL, }; +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 #define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 #define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 #define HV_STATUS_INVALID_CONNECTION_ID 18 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 @@ -318,6 +394,8 @@ typedef struct _HV_REFERENCE_TSC_PAGE { #define HV_SYNIC_SINT_COUNT (16) /* Define the expected SynIC version. */ #define HV_SYNIC_VERSION_1 (0x1) +/* Valid SynIC vectors are 16-255. */ +#define HV_SYNIC_FIRST_VALID_VECTOR (16) #define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) #define HV_SYNIC_SIMP_ENABLE (1ULL << 0) @@ -412,6 +490,216 @@ struct hv_timer_message_payload { __u64 delivery_time; /* When the message was delivered */ }; +/* Define virtual processor assist page structure. 
*/ +struct hv_vp_assist_page { + __u32 apic_assist; + __u32 reserved; + __u64 vtl_control[2]; + __u64 nested_enlightenments_control[2]; + __u32 enlighten_vmentry; + __u64 current_nested_vmcs; +}; + +struct hv_enlightened_vmcs { + u32 revision_id; + u32 abort; + + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + u64 host_cr0; + u64 host_cr3; + u64 host_cr4; + + u64 host_ia32_sysenter_esp; + u64 host_ia32_sysenter_eip; + u64 host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + u64 guest_es_base; + u64 guest_cs_base; + u64 guest_ss_base; + u64 guest_ds_base; + u64 guest_fs_base; + u64 guest_gs_base; + u64 guest_ldtr_base; + u64 guest_tr_base; + u64 guest_gdtr_base; + u64 guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + + u64 cr3_target_value0; + u64 cr3_target_value1; + u64 cr3_target_value2; + u64 cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 
vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + u64 guest_pending_dbg_exceptions; + u64 guest_sysenter_esp; + u64 guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + u64 cr0_guest_host_mask; + u64 cr4_guest_host_mask; + u64 cr0_read_shadow; + u64 cr4_read_shadow; + u64 guest_cr0; + u64 guest_cr3; + u64 guest_cr4; + u64 guest_dr7; + + u64 host_fs_base; + u64 host_gs_base; + u64 host_tr_base; + u64 host_gdtr_base; + u64 host_idtr_base; + u64 host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + u64 exit_qualification; + u64 exit_io_instruction_ecx; + u64 exit_io_instruction_esi; + u64 exit_io_instruction_edi; + u64 exit_io_instruction_eip; + + u64 guest_linear_address; + u64 guest_rsp; + u64 guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + u64 guest_rip; + + u32 hv_clean_fields; + u32 hv_padding_32; + u32 hv_synthetic_controls; + u32 hv_enlightenments_control; + u32 hv_vp_id; + + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 padding64_5[7]; + u64 xss_exit_bitmap; + u64 padding64_6[7]; +}; + +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP 
BIT(1) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) + +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF + #define HV_STIMER_ENABLE (1ULL << 0) #define HV_STIMER_PERIODIC (1ULL << 1) #define HV_STIMER_LAZY (1ULL << 2) diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h new file mode 100644 index 000000000000..3cb002b1d0f9 --- /dev/null +++ b/arch/x86/include/asm/intel_pconfig.h @@ -0,0 +1,65 @@ +#ifndef _ASM_X86_INTEL_PCONFIG_H +#define _ASM_X86_INTEL_PCONFIG_H + +#include <asm/asm.h> +#include <asm/processor.h> + +enum pconfig_target { + INVALID_TARGET = 0, + MKTME_TARGET = 1, + PCONFIG_TARGET_NR +}; + +int pconfig_target_supported(enum pconfig_target target); + +enum pconfig_leaf { + MKTME_KEY_PROGRAM = 0, + PCONFIG_LEAF_INVALID, +}; + +#define PCONFIG ".byte 0x0f, 0x01, 0xc5" + +/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */ + +/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */ +#define MKTME_KEYID_SET_KEY_DIRECT 0 +#define MKTME_KEYID_SET_KEY_RANDOM 1 +#define MKTME_KEYID_CLEAR_KEY 2 +#define MKTME_KEYID_NO_ENCRYPT 3 + +/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */ +#define MKTME_AES_XTS_128 (1 << 8) + 
+/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */ +#define MKTME_PROG_SUCCESS 0 +#define MKTME_INVALID_PROG_CMD 1 +#define MKTME_ENTROPY_ERROR 2 +#define MKTME_INVALID_KEYID 3 +#define MKTME_INVALID_ENC_ALG 4 +#define MKTME_DEVICE_BUSY 5 + +/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */ +struct mktme_key_program { + u16 keyid; + u32 keyid_ctrl; + u8 __rsvd[58]; + u8 key_field_1[64]; + u8 key_field_2[64]; +} __packed __aligned(256); + +static inline int mktme_key_program(struct mktme_key_program *key_program) +{ + unsigned long rax = MKTME_KEY_PROGRAM; + + if (!pconfig_target_supported(MKTME_TARGET)) + return -ENXIO; + + asm volatile(PCONFIG + : "=a" (rax), "=b" (key_program) + : "0" (rax), "1" (key_program) + : "memory", "cc"); + + return rax; +} + +#endif /* _ASM_X86_INTEL_PCONFIG_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 95e948627fd0..f6e5b9375d8c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -232,21 +232,6 @@ extern void set_iounmap_nonlazy(void); */ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. 
Accidentally out of order processors (PPro errata #51) - */ - -static inline void flush_write_buffers(void) -{ -#if defined(CONFIG_X86_PPRO_FENCE) - asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -#endif -} - #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a8834dd546cd..fd20a2334885 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -183,16 +183,17 @@ extern void disable_ioapic_support(void); extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_disable_io_apic(void); +extern void native_restore_boot_irq_mode(void); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { - return x86_io_apic_ops.read(apic, reg); + return x86_apic_ops.io_apic_read(apic, reg); } extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); -extern void disable_IO_APIC(void); +extern void clear_IO_APIC(void); +extern void restore_boot_irq_mode(void); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin); extern void print_IO_APICs(void); #else /* !CONFIG_X86_IO_APIC */ @@ -228,10 +229,11 @@ static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL -#define native_disable_io_apic NULL +#define native_restore_boot_irq_mode NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } +static inline void restore_boot_irq_mode(void) { } #endif diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 1e5d5d92eb40..baedab8ac538 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -2,13 +2,10 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern const struct dma_map_ops nommu_dma_ops; extern int force_iommu, 
no_iommu; extern int iommu_detected; extern int iommu_pass_through; -int x86_dma_supported(struct device *dev, u64 mask); - /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e71c1120426b..548d90bbf919 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -34,11 +34,6 @@ * (0x80 is the syscall vector, 0x30-0x3f are for ISA) */ #define FIRST_EXTERNAL_VECTOR 0x20 -/* - * We start allocating at 0x21 to spread out vectors evenly between - * priority levels. (0x80 is the syscall vector) - */ -#define VECTOR_OFFSET_START 1 /* * Reserve the lowest usable vector (and hence lowest priority) 0x20 for @@ -106,9 +101,10 @@ #if IS_ENABLED(CONFIG_HYPERV) #define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#define HYPERV_STIMER0_VECTOR 0xed #endif -#define LOCAL_TIMER_VECTOR 0xed +#define LOCAL_TIMER_VECTOR 0xec #define NR_VECTORS 256 @@ -118,8 +114,6 @@ #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif -#define FPU_IRQ 13 - /* * Size the maximum number of interrupts. 
* diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h index 875b54376689..a34897aef2c2 100644 --- a/arch/x86/include/asm/jailhouse_para.h +++ b/arch/x86/include/asm/jailhouse_para.h @@ -1,7 +1,7 @@ -/* SPDX-License-Identifier: GPL2.0 */ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Jailhouse paravirt_ops implementation + * Jailhouse paravirt detection * * Copyright (c) Siemens AG, 2015-2017 * diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 460991e3b529..db7ba2feb947 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -5,10 +5,6 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY -extern unsigned long page_offset_base; -extern unsigned long vmalloc_base; -extern unsigned long vmemmap_base; - void kernel_randomize_memory(void); #else static inline void kernel_randomize_memory(void) { } diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h index 9f07cff43705..df89ee7d3e9e 100644 --- a/arch/x86/include/asm/kexec-bzimage64.h +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -2,6 +2,6 @@ #ifndef _ASM_KEXEC_BZIMAGE64_H #define _ASM_KEXEC_BZIMAGE64_H -extern struct kexec_file_ops kexec_bzImage64_ops; +extern const struct kexec_file_ops kexec_bzImage64_ops; #endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd6f57a54a26..c25775fad4ed 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -34,6 +34,7 @@ #include <asm/msr-index.h> #include <asm/asm.h> #include <asm/kvm_page_track.h> +#include <asm/hyperv-tlfs.h> #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 @@ -73,6 +74,7 @@ #define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) +#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) #define CR0_RESERVED_BITS \ (~(unsigned 
long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -498,6 +500,7 @@ struct kvm_vcpu_arch { u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool apicv_active; + bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; int32_t apic_arb_prio; @@ -507,6 +510,7 @@ struct kvm_vcpu_arch { u64 smi_count; bool tpr_access_reporting; u64 ia32_xss; + u64 microcode_version; /* * Paging state of the vcpu @@ -570,7 +574,7 @@ struct kvm_vcpu_arch { } exception; struct kvm_queued_interrupt { - bool pending; + bool injected; bool soft; u8 nr; } interrupt; @@ -753,6 +757,12 @@ struct kvm_hv { u64 hv_crash_ctl; HV_REFERENCE_TSC_PAGE tsc_ref; + + struct idr conn_to_evt; + + u64 hv_reenlightenment_control; + u64 hv_tsc_emulation_control; + u64 hv_tsc_emulation_status; }; enum kvm_irqchip_mode { @@ -761,15 +771,6 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; -struct kvm_sev_info { - bool active; /* SEV enabled guest */ - unsigned int asid; /* ASID used for this guest */ - unsigned int handle; /* SEV firmware handle */ - int fd; /* SEV device fd */ - unsigned long pages_locked; /* Number of pages locked */ - struct list_head regions_list; /* List of registered regions */ -}; - struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -799,13 +800,13 @@ struct kvm_arch { struct mutex apic_map_lock; struct kvm_apic_map *apic_map; - unsigned int tss_addr; bool apic_access_page_done; gpa_t wall_clock; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; + bool mwait_in_guest; + bool hlt_in_guest; + bool pause_in_guest; unsigned long irq_sources_bitmap; s64 kvmclock_offset; @@ -848,17 +849,8 @@ struct kvm_arch { bool disabled_lapic_found; - /* Struct members for AVIC */ - u32 avic_vm_id; - u32 ldr_mode; - struct page *avic_logical_id_table_page; - struct page *avic_physical_id_table_page; - struct hlist_node hnode; - bool 
x2apic_format; bool x2apic_broadcast_quirk_disabled; - - struct kvm_sev_info sev_info; }; struct kvm_vm_stat { @@ -935,6 +927,8 @@ struct kvm_x86_ops { bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); + struct kvm *(*vm_alloc)(void); + void (*vm_free)(struct kvm *); int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); @@ -1006,6 +1000,7 @@ struct kvm_x86_ops { void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); + int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); @@ -1018,6 +1013,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -1095,6 +1091,8 @@ struct kvm_x86_ops { int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + + int (*get_msr_feature)(struct kvm_msr_entry *entry); }; struct kvm_arch_async_pf { @@ -1106,6 +1104,17 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; +#define __KVM_HAVE_ARCH_VM_ALLOC +static inline struct kvm *kvm_arch_alloc_vm(void) +{ + return kvm_x86_ops->vm_alloc(); +} + +static inline void kvm_arch_free_vm(struct kvm *kvm) +{ + return kvm_x86_ops->vm_free(kvm); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -1184,6 +1193,8 @@ enum emulation_result { #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_RETRY (1 << 3) #define EMULTYPE_NO_REEXECUTE (1 << 4) +#define EMULTYPE_NO_UD_ON_FAIL (1 << 5) +#define EMULTYPE_VMWARE (1 << 6) int 
x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); @@ -1201,8 +1212,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; -int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); -int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); @@ -1464,7 +1474,4 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); - #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 7b407dda2bd7..3aea2658323a 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,6 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, #ifdef CONFIG_KVM_GUEST bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); +unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); @@ -115,6 +116,11 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +static inline unsigned int kvm_arch_para_hints(void) +{ + return 0; +} + static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 96ea4b5ba658..8c7b3e5a2d01 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -138,58 +138,6 @@ struct mce_log_buffer { struct mce entry[MCE_LOG_LEN]; }; -struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - 
bool lmce_disabled; - bool ignore_ce; - bool disabled; - bool ser; - bool recovery; - bool bios_cmci_threshold; - u8 banks; - s8 bootlog; - int tolerant; - int monarch_timeout; - int panic_timeout; - u32 rip_msr; -}; - -struct mce_vendor_flags { - /* - * Indicates that overflow conditions are not fatal, when set. - */ - __u64 overflow_recov : 1, - - /* - * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and - * Recovery. It indicates support for data poisoning in HW and deferred - * error interrupts. - */ - succor : 1, - - /* - * (AMD) SMCA: This bit indicates support for Scalable MCA which expands - * the register space for each MCA bank and also increases number of - * banks. Also, to accommodate the new banks and registers, the MCA - * register space is moved to a new MSR range. - */ - smca : 1, - - __reserved_0 : 61; -}; - -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); -}; - -extern struct mce_vendor_flags mce_flags; - -extern struct mca_msr_regs msr_ops; - enum mce_notifier_prios { MCE_PRIO_FIRST = INT_MAX, MCE_PRIO_SRAO = INT_MAX - 1, @@ -346,6 +294,7 @@ enum smca_bank_types { SMCA_IF, /* Instruction Fetch */ SMCA_L2_CACHE, /* L2 Cache */ SMCA_DE, /* Decoder Unit */ + SMCA_RESERVED, /* Reserved */ SMCA_EX, /* Execution Unit */ SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 22c5f3e6f820..c0643831706e 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -22,6 +22,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT extern u64 sme_me_mask; +extern bool sev_enabled; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, unsigned long decrypted_kernel_vaddr, @@ -48,8 +49,6 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); -void 
swiotlb_set_mem_attributes(void *vaddr, unsigned long size); - bool sme_active(void); bool sev_active(void); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 55520cec8b27..2b7cc5397f80 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -6,20 +6,6 @@ #include <linux/earlycpio.h> #include <linux/initrd.h> -#define native_rdmsr(msr, val1, val2) \ -do { \ - u64 __val = __rdmsr((msr)); \ - (void)((val1) = (u32)__val); \ - (void)((val2) = (u32)(__val >> 32)); \ -} while (0) - -#define native_wrmsr(msr, low, high) \ - __wrmsr(msr, low, high) - -#define native_wrmsrl(msr, val) \ - __wrmsr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)) - struct ucode_patch { struct list_head plist; void *data; /* Intel uses only this one */ @@ -37,7 +23,13 @@ struct cpu_signature { struct device; -enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; +enum ucode_state { + UCODE_OK = 0, + UCODE_NEW, + UCODE_UPDATED, + UCODE_NFOUND, + UCODE_ERROR, +}; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, @@ -54,7 +46,7 @@ struct microcode_ops { * are being called. * See also the "Synchronization" section in microcode_core.c. 
*/ - int (*apply_microcode) (int cpu); + enum ucode_state (*apply_microcode) (int cpu); int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); }; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c931b88982a0..57e3785d0d26 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -24,11 +24,12 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, #endif /* !CONFIG_PARAVIRT */ #ifdef CONFIG_PERF_EVENTS -extern struct static_key rdpmc_always_available; + +DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); static inline void load_mm_cr4(struct mm_struct *mm) { - if (static_key_false(&rdpmc_always_available) || + if (static_branch_unlikely(&rdpmc_always_available_key) || atomic_read(&mm->context.perf_rdpmc_allowed)) cr4_set_bits(X86_CR4_PCE); else @@ -74,6 +75,7 @@ static inline void *ldt_slot_va(int slot) return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); #else BUG(); + return (void *)fix_to_virt(FIX_HOLE); #endif } diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 25283f7eb299..b90e79610cf7 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -6,90 +6,23 @@ #include <linux/atomic.h> #include <linux/nmi.h> #include <asm/io.h> -#include <asm/hyperv.h> +#include <asm/hyperv-tlfs.h> #include <asm/nospec-branch.h> -/* - * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent - * is set by CPUID(HVCPUID_VERSION_FEATURES). 
- */ -enum hv_cpuid_function { - HVCPUID_VERSION_FEATURES = 0x00000001, - HVCPUID_VENDOR_MAXFUNCTION = 0x40000000, - HVCPUID_INTERFACE = 0x40000001, - - /* - * The remaining functions depend on the value of - * HVCPUID_INTERFACE - */ - HVCPUID_VERSION = 0x40000002, - HVCPUID_FEATURES = 0x40000003, - HVCPUID_ENLIGHTENMENT_INFO = 0x40000004, - HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005, -}; - struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 nested_features; u32 max_vp_index; u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; -/* - * Declare the MSR used to setup pages used to communicate with the hypervisor. - */ -union hv_x64_msr_hypercall_contents { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:11; - u64 guest_physical_address:52; - }; -}; - -/* - * TSC page layout. - */ - -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; - u64 reserved2[509]; -}; - -/* - * The guest OS needs to register the guest ID with the hypervisor. - * The guest ID is a 64 bit entity and the structure of this ID is - * specified in the Hyper-V specification: - * - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx - * - * While the current guideline does not specify how Linux guest ID(s) - * need to be generated, our plan is to publish the guidelines for - * Linux and other guest operating systems that currently are hosted - * on Hyper-V. The implementation here conforms to this yet - * unpublished guidelines. - * - * - * Bit(s) - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source - * 62:56 - Os Type; Linux is 0x100 - * 55:48 - Distro specific identification - * 47:16 - Linux kernel version number - * 15:0 - Distro specific identification - * - * - */ - -#define HV_LINUX_VENDOR_ID 0x8100 /* - * Generate the guest ID based on the guideline described above. + * Generate the guest ID. 
*/ static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, @@ -173,6 +106,19 @@ void hv_remove_kexec_handler(void); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); +/* + * Routines for stimer0 Direct Mode handling. + * On x86/x64, there are no percpu actions to take. + */ +void hv_stimer0_vector_handler(struct pt_regs *regs); +void hv_stimer0_callback_vector(void); +int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)); +void hv_remove_stimer0_irq(int irq); + +static inline void hv_enable_stimer0_percpu_irq(int irq) {} +static inline void hv_disable_stimer0_percpu_irq(int irq) {} + + #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; extern void *hv_hypercall_pg; @@ -215,14 +161,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return hv_status; } -#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_REP_COMP_OFFSET 32 -#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) -#define HV_HYPERCALL_REP_START_OFFSET 48 -#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) - /* Fast hypercall with 8 bytes of input and no output */ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) { @@ -294,6 +232,15 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, */ extern u32 *hv_vp_index; extern u32 hv_max_vp_index; +extern struct hv_vp_assist_page **hv_vp_assist_page; + +static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) +{ + if (!hv_vp_assist_page) + return NULL; + + return hv_vp_assist_page[cpu]; +} /** * hv_cpu_number_to_vp_number() - Map CPU to VP. 
@@ -330,6 +277,10 @@ static inline void hyperv_setup_mmu_ops(void) {} static inline void set_hv_tscchange_cb(void (*cb)(void)) {} static inline void clear_hv_tscchange_cb(void) {} static inline void hyperv_stop_tsc_emulation(void) {}; +static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) +{ + return NULL; +} #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c9084dedfcfa..53d5b1b9255e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -353,7 +353,21 @@ /* Fam 15h MSRs */ #define MSR_F15H_PERF_CTL 0xc0010200 +#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL +#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2) +#define MSR_F15H_PERF_CTL2 (MSR_F15H_PERF_CTL + 4) +#define MSR_F15H_PERF_CTL3 (MSR_F15H_PERF_CTL + 6) +#define MSR_F15H_PERF_CTL4 (MSR_F15H_PERF_CTL + 8) +#define MSR_F15H_PERF_CTL5 (MSR_F15H_PERF_CTL + 10) + #define MSR_F15H_PERF_CTR 0xc0010201 +#define MSR_F15H_PERF_CTR0 MSR_F15H_PERF_CTR +#define MSR_F15H_PERF_CTR1 (MSR_F15H_PERF_CTR + 2) +#define MSR_F15H_PERF_CTR2 (MSR_F15H_PERF_CTR + 4) +#define MSR_F15H_PERF_CTR3 (MSR_F15H_PERF_CTR + 6) +#define MSR_F15H_PERF_CTR4 (MSR_F15H_PERF_CTR + 8) +#define MSR_F15H_PERF_CTR5 (MSR_F15H_PERF_CTR + 10) + #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 #define MSR_F15H_PTSC 0xc0010280 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 30df295f6d94..04addd6e0a4a 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -108,6 +108,20 @@ static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } +#define native_rdmsr(msr, val1, val2) \ +do { \ + u64 __val = __rdmsr((msr)); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ +} while (0) + +#define native_wrmsr(msr, low, high) \ + __wrmsr(msr, low, high) + 
+#define native_wrmsrl(msr, val) \ + __wrmsr((msr), (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)) + static inline unsigned long long native_read_msr(unsigned int msr) { unsigned long long val; @@ -218,9 +232,6 @@ static __always_inline unsigned long long rdtsc_ordered(void) return rdtsc(); } -/* Deprecated, keep it for a cycle for easier merging: */ -#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0) - static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4d57894635f2..f928ad9b143f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -6,6 +6,51 @@ #include <asm/alternative.h> #include <asm/alternative-asm.h> #include <asm/cpufeatures.h> +#include <asm/msr-index.h> + +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; lfence; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. 
+ */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + lfence; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + lfence; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; #ifdef __ASSEMBLY__ @@ -23,6 +68,18 @@ .endm /* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +.macro ANNOTATE_RETPOLINE_SAFE + .Lannotate_\@: + .pushsection .discard.retpoline_safe + _ASM_PTR .Lannotate_\@ + .popsection +.endm + +/* * These are the bare retpoline primitives for indirect jmp and call. * Do not use these directly; they only exist to make the ALTERNATIVE * invocation below less ugly. @@ -58,9 +115,9 @@ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(jmp *\reg), \ + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD #else jmp *\reg #endif @@ -69,18 +126,25 @@ .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(call *\reg), \ + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD #else call *\reg #endif .endm -/* This clobbers the BX register */ -.macro FILL_RETURN_BUFFER nr:req ftr:req + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. 
+ */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req #ifdef CONFIG_RETPOLINE - ALTERNATIVE "", "call __clear_rsb", \ftr + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: #endif .endm @@ -92,6 +156,12 @@ ".long 999b - .\n\t" \ ".popsection\n\t" +#define ANNOTATE_RETPOLINE_SAFE \ + "999:\n\t" \ + ".pushsection .discard.retpoline_safe\n\t" \ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + #if defined(CONFIG_X86_64) && defined(RETPOLINE) /* @@ -101,6 +171,7 @@ # define CALL_NOSPEC \ ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE( \ + ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ "call __x86_indirect_thunk_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE) @@ -112,7 +183,10 @@ * otherwise we'll run out of registers. We don't care about CET * here, anyway. */ -# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ +# define CALL_NOSPEC \ + ALTERNATIVE( \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ " jmp 904f;\n" \ " .align 16\n" \ "901: call 903f;\n" \ @@ -155,20 +229,90 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - alternative_input("", - "call __fill_rsb", - X86_FEATURE_RETPOLINE, - ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); + unsigned long loops; + + asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=r" (loops), ASM_CALL_CONSTRAINT + : : "memory" ); #endif } +#define alternative_msr_write(_msr, _val, _feature) \ + asm volatile(ALTERNATIVE("", \ + "movl %[msr], %%ecx\n\t" \ + "movl %[val], %%eax\n\t" \ + "movl $0, %%edx\n\t" \ + "wrmsr", \ + _feature) \ + : : [msr] "i" (_msr), [val] "i" (_val) \ + : "eax", "ecx", "edx", "memory") + static inline void indirect_branch_prediction_barrier(void) { - alternative_input("", - "call __ibp_barrier", - X86_FEATURE_USE_IBPB, - 
ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory")); + alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, + X86_FEATURE_USE_IBPB); } +/* + * With retpoline, we must use IBRS to restrict branch prediction + * before calling into firmware. + * + * (Implemented as CPP macros due to header hell.) + */ +#define firmware_restrict_branch_speculation_start() \ +do { \ + preempt_disable(); \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS, \ + X86_FEATURE_USE_IBRS_FW); \ +} while (0) + +#define firmware_restrict_branch_speculation_end() \ +do { \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, 0, \ + X86_FEATURE_USE_IBRS_FW); \ + preempt_enable(); \ +} while (0) + #endif /* __ASSEMBLY__ */ + +/* + * Below is used in the eBPF JIT compiler and emits the byte sequence + * for the following assembly: + * + * With retpolines configured: + * + * callq do_rop + * spec_trap: + * pause + * lfence + * jmp spec_trap + * do_rop: + * mov %rax,(%rsp) + * retq + * + * Without retpolines configured: + * + * jmp *%rax + */ +#ifdef CONFIG_RETPOLINE +# define RETPOLINE_RAX_BPF_JIT_SIZE 17 +# define RETPOLINE_RAX_BPF_JIT() \ + EMIT1_off32(0xE8, 7); /* callq do_rop */ \ + /* spec_trap: */ \ + EMIT2(0xF3, 0x90); /* pause */ \ + EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ + EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ + /* do_rop: */ \ + EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT1(0xC3); /* retq */ +#else +# define RETPOLINE_RAX_BPF_JIT_SIZE 2 +# define RETPOLINE_RAX_BPF_JIT() \ + EMIT2(0xFF, 0xE0); /* jmp *%rax */ +#endif + #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4baa6bceb232..939b1cff4a7b 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -11,6 +11,10 @@ extern unsigned long max_pfn; extern unsigned long phys_base; +extern unsigned long page_offset_base; +extern unsigned long vmalloc_base; +extern unsigned long vmemmap_base; + static inline unsigned long 
__phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; @@ -47,15 +51,11 @@ static inline void clear_page(void *page) clear_page_erms, X86_FEATURE_ERMS, "=D" (page), "0" (page) - : "memory", "rax", "rcx"); + : "cc", "memory", "rax", "rcx"); } void copy_page(void *to, void *from); -#ifdef CONFIG_X86_MCE -#define arch_unmap_kpfn arch_unmap_kpfn -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index e1407312c412..2c5a966dc222 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -37,26 +37,24 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. */ -#ifdef CONFIG_X86_5LEVEL -#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL) -#else -#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) -#endif +#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL) +#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL) -#ifdef CONFIG_RANDOMIZE_MEMORY +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base #else -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#endif /* CONFIG_RANDOMIZE_MEMORY */ +#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4 +#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#ifdef CONFIG_X86_5LEVEL + #define __PHYSICAL_MASK_SHIFT 52 -#define __VIRTUAL_MASK_SHIFT 56 + +#ifdef CONFIG_X86_5LEVEL +#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 
56 : 47) #else -#define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 47 #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 892df375b615..9be2bf13825b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -7,6 +7,7 @@ #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> +#include <asm/nospec-branch.h> #include <asm/paravirt_types.h> @@ -297,9 +298,9 @@ static inline void __flush_tlb_global(void) { PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel); } -static inline void __flush_tlb_single(unsigned long addr) +static inline void __flush_tlb_one_user(unsigned long addr) { - PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); + PVOP_VCALL1(pv_mmu_ops.flush_tlb_one_user, addr); } static inline void flush_tlb_others(const struct cpumask *cpumask, @@ -567,17 +568,22 @@ static inline p4dval_t p4d_val(p4d_t p4d) return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d); } -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { - pgdval_t val = native_pgd_val(pgd); - - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val); + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd)); } -static inline void pgd_clear(pgd_t *pgdp) -{ - set_pgd(pgdp, __pgd(0)); -} +#define set_pgd(pgdp, pgdval) do { \ + if (pgtable_l5_enabled) \ + __set_pgd(pgdp, pgdval); \ + else \ + set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ +} while (0) + +#define pgd_clear(pgdp) do { \ + if (pgtable_l5_enabled) \ + set_pgd(pgdp, __pgd(0)); \ +} while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ @@ -879,23 +885,27 @@ extern void default_banner(void); #define INTERRUPT_RETURN \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) + ANNOTATE_RETPOLINE_SAFE; \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ 
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #ifdef CONFIG_X86_32 #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ + ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx #else /* !CONFIG_X86_32 */ @@ -917,21 +927,25 @@ extern void default_banner(void); */ #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ ) #define GET_CR2_INTO_RAX \ - call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + ANNOTATE_RETPOLINE_SAFE; \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);) #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 6ec54d01972d..180bc0bff0fb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -43,6 +43,7 @@ #include <asm/desc_defs.h> #include <asm/kmap_types.h> #include <asm/pgtable_types.h> +#include <asm/nospec-branch.h> struct page; struct thread_struct; @@ 
-217,7 +218,7 @@ struct pv_mmu_ops { /* TLB operations */ void (*flush_tlb_user)(void); void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_one_user)(unsigned long addr); void (*flush_tlb_others)(const struct cpumask *cpus, const struct flush_tlb_info *info); @@ -392,7 +393,9 @@ int paravirt_disable_iospace(void); * offset into the paravirt_patch_template structure, and can therefore be * freely converted back into a structure offset. */ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" +#define PARAVIRT_CALL \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%c[paravirt_opptr];" /* * These macros are intended to wrap calls through one of the paravirt diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index eb66fa9cd0fc..959d618dbb17 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -151,6 +151,8 @@ extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, phys_addr_t addr); extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end); extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); +extern struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start, + int end, u64 addr); extern struct list_head pci_mmcfg_list; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index ba3c523aaf16..a06b07399d17 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, { bool oldbit; - asm volatile("bt "__percpu_arg(2)",%1" + asm volatile("btl "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index aff42e1da6ee..263c142a6a6c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -167,6 +167,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, 
pud_t *pud, #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { + if (!pgtable_l5_enabled) + return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } @@ -191,7 +193,8 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - ___p4d_free_tlb(tlb, p4d); + if (pgtable_l5_enabled) + ___p4d_free_tlb(tlb, p4d); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 876b4c77d983..6a59a6d0cc50 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -44,5 +44,6 @@ typedef union { */ #define PTRS_PER_PTE 512 +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 63c2552b6b65..f1633de5a675 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags; #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) -#define pgd_clear(pgd) native_pgd_clear(pgd) +#define pgd_clear(pgd) (pgtable_l5_enabled ? 
native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d @@ -350,14 +350,14 @@ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { pmdval_t v = native_pmd_val(pmd); - return __pmd(v | set); + return native_make_pmd(v | set); } static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) { pmdval_t v = native_pmd_val(pmd); - return __pmd(v & ~clear); + return native_make_pmd(v & ~clear); } static inline pmd_t pmd_mkold(pmd_t pmd) @@ -409,14 +409,14 @@ static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); - return __pud(v | set); + return native_make_pud(v | set); } static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) { pudval_t v = native_pud_val(pud); - return __pud(v & ~clear); + return native_make_pud(v & ~clear); } static inline pud_t pud_mkold(pud_t pud) @@ -526,22 +526,39 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) return protval; } +static inline pgprotval_t check_pgprot(pgprot_t pgprot) +{ + pgprotval_t massaged_val = massage_pgprot(pgprot); + + /* mmdebug.h can not be included here because of dependencies */ +#ifdef CONFIG_DEBUG_VM + WARN_ONCE(pgprot_val(pgprot) != massaged_val, + "attempted to set unsupported pgprot: %016llx " + "bits: %016llx supported: %016llx\n", + (u64)pgprot_val(pgprot), + (u64)pgprot_val(pgprot) ^ massaged_val, + (u64)__supported_pte_mask); +#endif + + return massaged_val; +} + static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + check_pgprot(pgprot)); } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ 
-553,7 +570,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * the newprot (if present): */ val &= _PAGE_CHG_MASK; - val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; return __pte(val); } @@ -563,7 +580,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdval_t val = pmd_val(pmd); val &= _HPAGE_CHG_MASK; - val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; return __pmd(val); } @@ -584,6 +601,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) +static inline pgprot_t arch_filter_pgprot(pgprot_t prot) +{ + return canon_pgprot(prot); +} + static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) @@ -859,6 +881,8 @@ static inline unsigned long p4d_index(unsigned long address) #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { + if (!pgtable_l5_enabled) + return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } @@ -876,6 +900,8 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { + if (!pgtable_l5_enabled) + return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } @@ -883,6 +909,9 @@ static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; + if (!pgtable_l5_enabled) + return 0; + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; @@ -891,6 +920,8 @@ static inline int pgd_bad(pgd_t pgd) static inline int pgd_none(pgd_t pgd) { + if (!pgtable_l5_enabled) + return 0; /* * There is no need to do a workaround for the KNL stray * A/D bit erratum here. 
PGDs only point to page tables diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index e67c0620aec2..88a056b01db4 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -32,6 +32,9 @@ extern pmd_t initial_pg_pmd[]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } void paging_init(void); +void sync_initial_page_table(void); + +static inline int pgd_large(pgd_t pgd) { return 0; } /* * Define this if things work differently on an i386 and an i486: @@ -61,7 +64,7 @@ void paging_init(void); #define kpte_clear_flush(ptep, vaddr) \ do { \ pte_clear(&init_mm, (vaddr), (ptep)); \ - __flush_tlb_one((vaddr)); \ + __flush_tlb_one_kernel((vaddr)); \ } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 0777e18a1d23..e3225e83db7d 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -15,6 +15,8 @@ # include <asm/pgtable-2level_types.h> #endif +#define pgtable_l5_enabled 0 + #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 81462e9a34f6..877bc27718ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -28,6 +28,7 @@ extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt extern void paging_init(void); +static inline void sync_initial_page_table(void) { } #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %p(%016lx)\n", \ @@ -217,29 +218,26 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { -#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) - p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); -#else - *p4dp = p4d; -#endif + pgd_t pgd; + + if (pgtable_l5_enabled || 
!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + *p4dp = p4d; + return; + } + + pgd = native_make_pgd(native_p4d_val(p4d)); + pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); + *p4dp = native_make_p4d(native_pgd_val(pgd)); } static inline void native_p4d_clear(p4d_t *p4d) { -#ifdef CONFIG_X86_5LEVEL native_set_p4d(p4d, native_make_p4d(0)); -#else - native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); -#endif } static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION *pgdp = pti_set_user_pgd(pgdp, pgd); -#else - *pgdp = pgd; -#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6b8f73dcbc2c..adb47552e6bb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -20,6 +20,18 @@ typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; +#ifdef CONFIG_X86_5LEVEL +extern unsigned int __pgtable_l5_enabled; +#ifndef pgtable_l5_enabled +#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57) +#endif +#else +#define pgtable_l5_enabled 0 +#endif + +extern unsigned int pgdir_shift; +extern unsigned int ptrs_per_p4d; + #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 @@ -29,24 +41,28 @@ typedef struct { pteval_t pte; } pte_t; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 48 +#define PGDIR_SHIFT pgdir_shift #define PTRS_PER_PGD 512 /* * 4th level page in 5-level paging case */ -#define P4D_SHIFT 39 -#define PTRS_PER_P4D 512 -#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE - 1)) +#define P4D_SHIFT 39 +#define MAX_PTRS_PER_P4D 512 +#define PTRS_PER_P4D ptrs_per_p4d +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) + +#define MAX_POSSIBLE_PHYSMEM_BITS 52 #else /* CONFIG_X86_5LEVEL */ /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define 
PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 +#define MAX_PTRS_PER_P4D 1 #endif /* CONFIG_X86_5LEVEL */ @@ -82,31 +98,33 @@ typedef struct { pteval_t pte; } pte_t; * range must not overlap with anything except the KASAN shadow area, which * is correct as KASAN disables KASLR. */ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define MAXMEM (1UL << MAX_PHYSMEM_BITS) -#ifdef CONFIG_X86_5LEVEL -# define VMALLOC_SIZE_TB _AC(12800, UL) -# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) -# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) -# define LDT_PGD_ENTRY _AC(-112, UL) -# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) -#else -# define VMALLOC_SIZE_TB _AC(32, UL) -# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -# define LDT_PGD_ENTRY _AC(-3, UL) -# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) -#endif +#define LDT_PGD_ENTRY_L4 -3UL +#define LDT_PGD_ENTRY_L5 -112UL +#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) +#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) + +#define __VMALLOC_BASE_L4 0xffffc90000000000UL +#define __VMALLOC_BASE_L5 0xffa0000000000000UL + +#define VMALLOC_SIZE_TB_L4 32UL +#define VMALLOC_SIZE_TB_L5 12800UL + +#define __VMEMMAP_BASE_L4 0xffffea0000000000UL +#define __VMEMMAP_BASE_L5 0xffd4000000000000UL -#ifdef CONFIG_RANDOMIZE_MEMORY +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base +# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? 
VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4) # define VMEMMAP_START vmemmap_base #else -# define VMALLOC_START __VMALLOC_BASE -# define VMEMMAP_START __VMEMMAP_BASE -#endif /* CONFIG_RANDOMIZE_MEMORY */ +# define VMALLOC_START __VMALLOC_BASE_L4 +# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4 +# define VMEMMAP_START __VMEMMAP_BASE_L4 +#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ -#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) +#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 3696398a9475..1e5a40673953 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -174,7 +174,6 @@ enum page_cache_mode { #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) -#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -197,20 +196,21 @@ enum page_cache_mode { #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) -#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) -#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) -#define PAGE_KERNEL_LARGE_EXEC 
__pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) - -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define default_pgprot(x) __pgprot((x) & __default_kernel_pte_mask) + +#define PAGE_KERNEL default_pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC default_pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO default_pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC default_pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC default_pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX default_pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE default_pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE default_pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC default_pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VVAR default_pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO default_pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE default_pgprot(__PAGE_KERNEL_IO_NOCACHE) #endif /* __ASSEMBLY__ */ @@ -323,6 +323,11 @@ static inline pudval_t native_pud_val(pud_t pud) #else #include <asm-generic/pgtable-nopud.h> +static inline pud_t native_make_pud(pudval_t val) +{ + return (pud_t) { .p4d.pgd = native_make_pgd(val) }; +} + static inline pudval_t native_pud_val(pud_t pud) { return native_pgd_val(pud.p4d.pgd); @@ -344,6 +349,11 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) #else #include <asm-generic/pgtable-nopmd.h> +static inline pmd_t native_make_pmd(pmdval_t val) +{ + return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) }; +} + static inline pmdval_t native_pmd_val(pmd_t pmd) { return native_pgd_val(pmd.pud.p4d.pgd); @@ -475,6 +485,7 @@ static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) typedef struct page *pgtable_t; 
extern pteval_t __supported_pte_mask; +extern pteval_t __default_kernel_pte_mask; extern void set_nx(void); extern int nx_enabled; diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 5973a2f3db3d..059823bb8af7 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -135,6 +135,7 @@ struct sst_platform_info { const struct sst_res_info *res_info; const struct sst_lib_dnld_info *lib_info; const char *platform; + bool streams_lost_on_suspend; }; int add_sst_platform_device(void); #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 793bae7e7ce3..21a114914ba4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -91,7 +91,7 @@ struct cpuinfo_x86 { __u8 x86; /* CPU family */ __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; - __u8 x86_mask; + __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; @@ -109,7 +109,7 @@ struct cpuinfo_x86 { char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ - int x86_cache_size; + unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ /* Cache QoS architectural values: */ int x86_cache_max_rmid; /* max index */ @@ -407,9 +407,19 @@ union irq_stack_union { DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; DECLARE_INIT_PER_CPU(irq_stack_union); +static inline unsigned long cpu_kernelmode_gs_base(int cpu) +{ + return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); +} + DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); + +#if IS_ENABLED(CONFIG_KVM) +/* Save actual FS/GS selectors and bases to current->thread */ +void save_fsgs_for_kvm(void); +#endif #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -739,13 +749,11 @@ enum 
idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, extern void enable_sep_cpu(void); extern int sysenter_setup(void); -extern void early_trap_init(void); void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); @@ -977,7 +985,5 @@ bool xen_set_default_idle(void); void stop_this_cpu(void *dummy); void df_debug(struct pt_regs *regs, long error_code); - -void __ibp_barrier(void); - +void microcode_check(void); #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 0b5ef05b2d2d..38a17f1d5c9d 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -6,8 +6,10 @@ #ifdef CONFIG_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); +extern void pti_clone_kernel_text(void); #else static inline void pti_check_boottime_disable(void) { } +static inline void pti_clone_kernel_text(void) { } #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index 4e44250e7d0d..4cf11d88d3b3 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -17,7 +17,7 @@ #define _REFCOUNT_EXCEPTION \ ".pushsection .text..refcount\n" \ "111:\tlea %[counter], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD0 "\n" \ + "112:\t" ASM_UD2 "\n" \ ASM_UNREACHABLE \ ".popsection\n" \ "113:\n" \ @@ -67,13 +67,13 @@ static __always_inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) { GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, "er", i, "%0", e); + r->refs.counter, "er", i, "%0", e, "cx"); } static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) { GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, "%0", e); + 
r->refs.counter, "%0", e, "cx"); } static __always_inline __must_check diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fb3a6de7440b..6847d85400a8 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -53,12 +53,6 @@ # define NEED_MOVBE 0 #endif -#ifdef CONFIG_X86_5LEVEL -# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#else -# define NEED_LA57 0 -#endif - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -104,7 +98,7 @@ #define REQUIRED_MASK13 0 #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 (NEED_LA57) +#define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index f91c365e57c3..4914a3e7c803 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -2,8 +2,7 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc -#define __CLOBBERS_MEM "memory" -#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" +#define __CLOBBERS_MEM(clb...) 
"memory", ## clb #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) @@ -40,18 +39,19 @@ do { \ #endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) -#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc, clobbers...)\ __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ - __CLOBBERS_MEM_CC_CX) + __CLOBBERS_MEM(clobbers)) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ - __CLOBBERS_MEM, vcon (val)) + __CLOBBERS_MEM(), vcon (val)) -#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc, \ + clobbers...) \ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ - __CLOBBERS_MEM_CC_CX, vcon (val)) + __CLOBBERS_MEM(clobbers), vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index d6baf23782bc..5c019d23d06b 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -10,6 +10,7 @@ extern struct exception_table_entry __stop___ex_table[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; +extern char __entry_trampoline_start[], __entry_trampoline_end[]; #endif #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 461f53d27708..f75bff8f9d82 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -129,6 +129,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); +void calculate_max_logical_packages(void); void 
native_smp_cpus_done(unsigned int max_cpus); void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); @@ -176,16 +177,6 @@ static inline int wbinvd_on_all_cpus(void) extern unsigned disabled_cpus; #ifdef CONFIG_X86_LOCAL_APIC - -#ifndef CONFIG_X86_64 -static inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); -} - -#endif - extern int hard_smp_processor_id(void); #else /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4fc1e9d3c43e..4617a2bf123c 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,13 +27,8 @@ # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# ifdef CONFIG_X86_5LEVEL -# define MAX_PHYSADDR_BITS 52 -# define MAX_PHYSMEM_BITS 52 -# else -# define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 46 -# endif +# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44) +# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ? 
52 : 46) #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f73706878772..133d9425fced 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -87,8 +87,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -extern unsigned int code_bytes; - /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0487ac054870..93b462e48067 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -60,7 +60,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercept_dr; u32 intercept_exceptions; u64 intercept; - u8 reserved_1[42]; + u8 reserved_1[40]; + u16 pause_filter_thresh; u16 pause_filter_count; u64 iopm_base_pa; u64 msrpm_base_pa; diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 1c6a6cb230ff..ff6c92eff035 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -27,12 +27,4 @@ static inline void pci_swiotlb_late_init(void) { } #endif - -extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs); -extern void x86_swiotlb_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs); - #endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h deleted file mode 100644 index 82c34ee25a65..000000000000 --- a/arch/x86/include/asm/sys_ia32.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * sys_ia32.h - Linux ia32 syscall interfaces - * - * Copyright (c) 2008 Jaswinder Singh Rajput - * - * This file is released under the GPLv2. - * See the file COPYING for more details. 
- */ - -#ifndef _ASM_X86_SYS_IA32_H -#define _ASM_X86_SYS_IA32_H - -#ifdef CONFIG_COMPAT - -#include <linux/compiler.h> -#include <linux/linkage.h> -#include <linux/types.h> -#include <linux/signal.h> -#include <asm/compat.h> -#include <asm/ia32.h> - -/* ia32/sys_ia32.c */ -asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long); -asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); - -asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *); -asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *); -asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); -asmlinkage long sys32_fstatat(unsigned int, const char __user *, - struct stat64 __user *, int); -struct mmap_arg_struct32; -asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); - -asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int); - -asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); -asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); - -long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); -long sys32_vm86_warning(void); - -asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t); -asmlinkage long sys32_sync_file_range(int, unsigned, unsigned, - unsigned, unsigned, int); -asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int); -asmlinkage long sys32_fallocate(int, int, unsigned, - unsigned, unsigned, unsigned); - -/* ia32/ia32_signal.c */ -asmlinkage long sys32_sigreturn(void); -asmlinkage long sys32_rt_sigreturn(void); - -#endif /* CONFIG_COMPAT */ - -#endif /* _ASM_X86_SYS_IA32_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 03eedc21246d..d653139857af 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,9 +20,13 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> +#ifdef CONFIG_X86_64 +typedef 
asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *); +#else typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_X86_64 */ extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h new file mode 100644 index 000000000000..e046a405743d --- /dev/null +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * syscall_wrapper.h - x86 specific wrappers to syscall definitions + */ + +#ifndef _ASM_X86_SYSCALL_WRAPPER_H +#define _ASM_X86_SYSCALL_WRAPPER_H + +/* Mapping of registers to parameters for syscalls on x86-64 and x32 */ +#define SC_X86_64_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,regs->di,,regs->si,,regs->dx \ + ,,regs->r10,,regs->r8,,regs->r9) \ + +/* Mapping of registers to parameters for syscalls on i386 */ +#define SC_IA32_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ + ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ + ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + +#ifdef CONFIG_IA32_EMULATION +/* + * For IA32 emulation, we need to handle "compat" syscalls *and* create + * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the + * ia32 regs in the proper order for shared or "common" syscalls. As some + * syscalls may not be implemented, we need to expand COND_SYSCALL in + * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this + * case as well. + */ +#define __IA32_COMPAT_SYS_STUBx(x, name, ...) 
\ + asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO); \ + asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ + } \ + +#define __IA32_SYS_STUBx(x, name, ...) \ + asmlinkage long __ia32_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__ia32_sys##name, ERRNO); \ + asmlinkage long __ia32_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ + } + +/* + * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias + * named __ia32_sys_*() + */ +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ + asmlinkage long __x64_sys_##sname(void) + +#define COND_SYSCALL(name) \ + cond_syscall(__x64_sys_##name); \ + cond_syscall(__ia32_sys_##name) + +#define SYS_NI(name) \ + SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); \ + SYSCALL_ALIAS(__ia32_sys_##name, sys_ni_posix_timers) + +#else /* CONFIG_IA32_EMULATION */ +#define __IA32_COMPAT_SYS_STUBx(x, name, ...) +#define __IA32_SYS_STUBx(x, fullname, name, ...) +#endif /* CONFIG_IA32_EMULATION */ + + +#ifdef CONFIG_X86_X32 +/* + * For the x32 ABI, we need to create a stub for compat_sys_*() which is aware + * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common + * with x86_64 obviously do not need such care. + */ +#define __X32_COMPAT_SYS_STUBx(x, name, ...) 
\ + asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO); \ + asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ + } \ + +#else /* CONFIG_X86_X32 */ +#define __X32_COMPAT_SYS_STUBx(x, name, ...) +#endif /* CONFIG_X86_X32 */ + + +#ifdef CONFIG_COMPAT +/* + * Compat means IA32_EMULATION and/or X86_X32. As they use a different + * mapping of registers to parameters, we need to generate stubs for each + * of them. + */ +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + __IA32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \ + __X32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ + } \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +/* + * As some compat syscalls may not be implemented, we need to expand + * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in + * kernel/time/posix-stubs.c to cover this case as well. + */ +#define COND_SYSCALL_COMPAT(name) \ + cond_syscall(__ia32_compat_sys_##name); \ + cond_syscall(__x32_compat_sys_##name) + +#define COMPAT_SYS_NI(name) \ + SYSCALL_ALIAS(__ia32_compat_sys_##name, sys_ni_posix_timers); \ + SYSCALL_ALIAS(__x32_compat_sys_##name, sys_ni_posix_timers) + +#endif /* CONFIG_COMPAT */ + + +/* + * Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes + * struct pt_regs *regs as the only argument of the syscall stub named + * __x64_sys_*(). It decodes just the registers it needs and passes them on to + * the __se_sys_*() wrapper performing sign extension and then to the + * __do_sys_*() function doing the actual job. 
These wrappers and functions + * are inlined (at least in most cases), meaning that the assembly looks + * as follows (slightly re-ordered for better readability): + * + * <__x64_sys_recv>: <-- syscall with 4 parameters + * callq <__fentry__> + * + * mov 0x70(%rdi),%rdi <-- decode regs->di + * mov 0x68(%rdi),%rsi <-- decode regs->si + * mov 0x60(%rdi),%rdx <-- decode regs->dx + * mov 0x38(%rdi),%rcx <-- decode regs->r10 + * + * xor %r9d,%r9d <-- clear %r9 + * xor %r8d,%r8d <-- clear %r8 + * + * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom() + * which takes 6 arguments + * + * cltq <-- extend return value to 64-bit + * retq <-- return + * + * This approach avoids leaking random user-provided register content down + * the call chain. + * + * If IA32_EMULATION is enabled, this macro generates an additional wrapper + * named __ia32_sys_*() which decodes the struct pt_regs *regs according + * to the i386 calling convention (bx, cx, dx, si, di, bp). + */ +#define __SYSCALL_DEFINEx(x, name, ...) 
\ + asmlinkage long __x64_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__x64_sys##name, ERRNO); \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long __x64_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ + } \ + __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +/* + * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for + * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not + * hurt, we only need to re-define it here to keep the naming congruent to + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() + * macros to work correctly. + */ +#ifndef SYSCALL_DEFINE0 +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + asmlinkage long __x64_sys_##sname(void) +#endif + +#ifndef COND_SYSCALL +#define COND_SYSCALL(name) cond_syscall(__x64_sys_##name) +#endif + +#ifndef SYS_NI +#define SYS_NI(name) SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); +#endif + + +/* + * For VSYSCALLS, we need to declare these three syscalls with the new + * pt_regs-based calling convention for in-kernel use. 
+ */ +struct pt_regs; +asmlinkage long __x64_sys_getcpu(const struct pt_regs *regs); +asmlinkage long __x64_sys_gettimeofday(const struct pt_regs *regs); +asmlinkage long __x64_sys_time(const struct pt_regs *regs); + +#endif /* _ASM_X86_SYSCALL_WRAPPER_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index bad25bb80679..9fa979dd0d9d 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -17,6 +17,13 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on); + +#ifdef CONFIG_X86_32 +/* + * These definitions are only valid on pure 32-bit systems; x86-64 uses a + * different syscall calling convention + */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); @@ -31,25 +38,14 @@ asmlinkage long sys_set_thread_area(struct user_desc __user *); asmlinkage long sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ -#ifdef CONFIG_X86_32 /* kernel/signal.c */ -asmlinkage unsigned long sys_sigreturn(void); +asmlinkage long sys_sigreturn(void); /* kernel/vm86_32.c */ struct vm86_struct; asmlinkage long sys_vm86old(struct vm86_struct __user *); asmlinkage long sys_vm86(unsigned long, unsigned long); -#else /* CONFIG_X86_32 */ - -/* X86_64 only */ -/* kernel/process_64.c */ -asmlinkage long sys_arch_prctl(int, unsigned long); - -/* kernel/sys_x86_64.c */ -asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 2b8f18ca5874..6690cd3fc8b1 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -131,7 +131,12 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) { 
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPUs have the same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } @@ -140,7 +145,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #else #define __flush_tlb() __native_flush_tlb() #define __flush_tlb_global() __native_flush_tlb_global() -#define __flush_tlb_single(addr) __native_flush_tlb_single(addr) +#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif static inline bool tlb_defer_switch_to_init_mm(void) @@ -400,7 +405,7 @@ static inline void __native_flush_tlb_global(void) /* * flush one page in the user mapping */ -static inline void __native_flush_tlb_single(unsigned long addr) +static inline void __native_flush_tlb_one_user(unsigned long addr) { u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); @@ -437,18 +442,31 @@ static inline void __flush_tlb_all(void) /* * flush one page in the kernel mapping */ -static inline void __flush_tlb_one(unsigned long addr) +static inline void __flush_tlb_one_kernel(unsigned long addr) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - __flush_tlb_single(addr); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. 
+ */ + __flush_tlb_one_user(addr); if (!static_cpu_has(X86_FEATURE_PTI)) return; /* - * __flush_tlb_single() will have cleared the TLB entry for this ASID, - * but since kernel space is replicated across all, we must also - * invalidate all others. + * See above. We need to propagate the flush to all other address + * spaces. In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. */ invalidate_other_asid(); } diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index cf5d53c3f9ea..2701d221583a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -31,6 +31,7 @@ static inline cycles_t get_cycles(void) } extern struct system_counterval_t convert_art_to_tsc(u64 art); +extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); extern void tsc_early_delay_calibrate(void); extern void tsc_init(void); diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index ecb9ddef128f..62c79e26a59a 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -3833,7 +3833,7 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1603000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x483000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x484000UL #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR ( \ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ is_uv2_hub() ? 
UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 8b6780751132..5db8b0b10766 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -352,6 +352,7 @@ enum vmcs_field { #define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ #define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ #define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index fc2f082ac635..ce8b4da07e35 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -122,12 +122,24 @@ struct x86_init_pci { * @guest_late_init: guest late init * @x2apic_available: X2APIC detection * @init_mem_mapping: setup early mappings during init_mem_mapping() + * @init_after_bootmem: guest init after boot allocator is finished */ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); void (*init_mem_mapping)(void); + void (*init_after_bootmem)(void); +}; + +/** + * struct x86_init_acpi - x86 ACPI init functions + * @get_root_pointer: get RSDP address + * @reduced_hw_early_init: hardware reduced platform early init + */ +struct x86_init_acpi { + u64 (*get_root_pointer)(void); + void (*reduced_hw_early_init)(void); }; /** @@ -144,6 +156,7 @@ struct x86_init_ops { struct x86_init_iommu iommu; struct x86_init_pci pci; struct x86_hyper_init hyper; + struct x86_init_acpi acpi; }; /** @@ -274,16 +287,16 @@ struct x86_msi_ops { void (*restore_msi_irqs)(struct pci_dev *dev); }; -struct x86_io_apic_ops { - unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*disable)(void); +struct x86_apic_ops { + unsigned int (*io_apic_read) (unsigned int apic, unsigned int reg); + void 
(*restore)(void); }; extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; -extern struct x86_io_apic_ops x86_io_apic_ops; +extern struct x86_apic_ops x86_apic_ops; extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index aebf60357758..a06cbf019744 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -137,15 +137,15 @@ struct boot_e820_entry { * setup data structure. */ struct jailhouse_setup_data { - u16 version; - u16 compatible_version; - u16 pm_timer_address; - u16 num_cpus; - u64 pci_mmconfig_base; - u32 tsc_khz; - u32 apic_khz; - u8 standard_ioapic; - u8 cpu_ids[255]; + __u16 version; + __u16 compatible_version; + __u16 pm_timer_address; + __u16 num_cpus; + __u64 pci_mmconfig_base; + __u32 tsc_khz; + __u32 apic_khz; + __u8 standard_ioapic; + __u8 cpu_ids[255]; } __attribute__((packed)); /* The so-called "zeropage" */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index f3a960488eae..c535c2fdea13 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -354,8 +354,25 @@ struct kvm_xcrs { __u64 padding[16]; }; -/* definition of registers in kvm_run */ +#define KVM_SYNC_X86_REGS (1UL << 0) +#define KVM_SYNC_X86_SREGS (1UL << 1) +#define KVM_SYNC_X86_EVENTS (1UL << 2) + +#define KVM_SYNC_X86_VALID_FIELDS \ + (KVM_SYNC_X86_REGS| \ + KVM_SYNC_X86_SREGS| \ + KVM_SYNC_X86_EVENTS) + +/* kvm_sync_regs struct included by kvm_run struct */ struct kvm_sync_regs { + /* Members of this structure are potentially malicious. + * Care must be taken by code reading, esp. interpreting, + * data fields from them inside KVM to prevent TOCTOU and + * double-fetch types of vulnerabilities. 
+ */ + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; }; #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 7a2ade4aa235..4c851ebb3ceb 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -3,15 +3,16 @@ #define _UAPI_ASM_X86_KVM_PARA_H #include <linux/types.h> -#include <asm/hyperv.h> /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 -/* This CPUID returns a feature bitmap in eax. Before enabling a particular - * paravirtualization, the appropriate feature bit should be checked. +/* This CPUID returns two feature bitmaps in eax, edx. Before enabling + * a particular paravirtualization, the appropriate feature bit should + * be checked in eax. The performance hint feature bit should be checked + * in edx. */ #define KVM_CPUID_FEATURES 0x40000001 #define KVM_FEATURE_CLOCKSOURCE 0 @@ -26,6 +27,9 @@ #define KVM_FEATURE_PV_EOI 6 #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 +#define KVM_FEATURE_ASYNC_PF_VMEXIT 10 + +#define KVM_HINTS_DEDICATED 0 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 91723461dc1f..955c2a2e1cf9 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -5,31 +5,36 @@ #include <linux/types.h> #include <linux/ioctl.h> -/* Fields are zero when not available */ +/* + * Fields are zero when not available. Also, this struct is shared with + * userspace mcelog and thus must keep existing fields at current offsets. 
+ * Only add new fields to the end of the structure + */ struct mce { - __u64 status; - __u64 misc; - __u64 addr; - __u64 mcgstatus; - __u64 ip; - __u64 tsc; /* cpu time stamp counter */ - __u64 time; /* wall time_t when error was detected */ - __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 inject_flags; /* software inject flags */ - __u8 severity; + __u64 status; /* Bank's MCi_STATUS MSR */ + __u64 misc; /* Bank's MCi_MISC MSR */ + __u64 addr; /* Bank's MCi_ADDR MSR */ + __u64 mcgstatus; /* Machine Check Global Status MSR */ + __u64 ip; /* Instruction Pointer when the error happened */ + __u64 tsc; /* CPU time stamp counter */ + __u64 time; /* Wall time_t when error was detected */ + __u8 cpuvendor; /* Kernel's X86_VENDOR enum */ + __u8 inject_flags; /* Software inject flags */ + __u8 severity; /* Error severity */ __u8 pad; - __u32 cpuid; /* CPUID 1 EAX */ - __u8 cs; /* code segment */ - __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu number; obsolete; use extcpu now */ - __u8 finished; /* entry is valid */ - __u32 extcpu; /* linux cpu number that detected the error */ - __u32 socketid; /* CPU socket ID */ - __u32 apicid; /* CPU initial apic ID */ - __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ - __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ - __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ - __u64 ppin; /* Protected Processor Inventory Number */ + __u32 cpuid; /* CPUID 1 EAX */ + __u8 cs; /* Code segment */ + __u8 bank; /* Machine check bank reporting the error */ + __u8 cpu; /* CPU number; obsoleted by extcpu */ + __u8 finished; /* Entry is valid */ + __u32 extcpu; /* Linux CPU number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial APIC ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* 
Protected Processor Inventory Number */ + __u32 microcode; /* Microcode revision */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h index 809134c644a6..90ab9a795b49 100644 --- a/arch/x86/include/uapi/asm/msgbuf.h +++ b/arch/x86/include/uapi/asm/msgbuf.h @@ -1 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_X64_MSGBUF_H +#define __ASM_X64_MSGBUF_H + +#if !defined(__x86_64__) || !defined(__ILP32__) #include <asm-generic/msgbuf.h> +#else +/* + * The msqid64_ds structure for x86 architecture with x32 ABI. + * + * On x86-32 and x86-64 we can just use the generic definition, but + * x32 uses the same binary layout as x86_64, which is differnet + * from other 32-bit architectures. + */ + +struct msqid64_ds { + struct ipc64_perm msg_perm; + __kernel_time_t msg_stime; /* last msgsnd time */ + __kernel_time_t msg_rtime; /* last msgrcv time */ + __kernel_time_t msg_ctime; /* last change time */ + __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */ + __kernel_ulong_t msg_qnum; /* number of messages in queue */ + __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */ + __kernel_pid_t msg_lspid; /* pid of last msgsnd */ + __kernel_pid_t msg_lrpid; /* last receive pid */ + __kernel_ulong_t __unused4; + __kernel_ulong_t __unused5; +}; + +#endif + +#endif /* __ASM_GENERIC_MSGBUF_H */ diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index 83c05fc2de38..644421f3823b 100644 --- a/arch/x86/include/uapi/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h @@ -1 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_X86_SHMBUF_H +#define __ASM_X86_SHMBUF_H + +#if !defined(__x86_64__) || !defined(__ILP32__) #include <asm-generic/shmbuf.h> +#else +/* + * The shmid64_ds structure for x86 architecture with x32 ABI. 
+ * + * On x86-32 and x86-64 we can just use the generic definition, but + * x32 uses the same binary layout as x86_64, which is differnet + * from other 32-bit architectures. + */ + +struct shmid64_ds { + struct ipc64_perm shm_perm; /* operation perms */ + size_t shm_segsz; /* size of segment (bytes) */ + __kernel_time_t shm_atime; /* last attach time */ + __kernel_time_t shm_dtime; /* last detach time */ + __kernel_time_t shm_ctime; /* last change time */ + __kernel_pid_t shm_cpid; /* pid of creator */ + __kernel_pid_t shm_lpid; /* pid of last operator */ + __kernel_ulong_t shm_nattch; /* no. of current attaches */ + __kernel_ulong_t __unused4; + __kernel_ulong_t __unused5; +}; + +struct shminfo64 { + __kernel_ulong_t shmmax; + __kernel_ulong_t shmmin; + __kernel_ulong_t shmmni; + __kernel_ulong_t shmseg; + __kernel_ulong_t shmall; + __kernel_ulong_t __unused1; + __kernel_ulong_t __unused2; + __kernel_ulong_t __unused3; + __kernel_ulong_t __unused4; +}; + +#endif + +#endif /* __ASM_X86_SHMBUF_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 29786c87e864..02d6f5cf4e70 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -57,7 +57,7 @@ obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o +obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o @@ -146,6 +146,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o - obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o + obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2aa92094b59d..3b20607d581b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ 
b/arch/x86/kernel/acpi/boot.c @@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; #ifdef CONFIG_X86_X2APIC - int apic_id; + u32 apic_id; u8 enabled; #endif @@ -215,6 +215,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; + /* Ignore invalid ID */ + if (apic_id == 0xffffffff) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -222,10 +226,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!apic->apic_id_valid(apic_id) && enabled) - printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); - else - acpi_register_lapic(apic_id, processor->uid, enabled); + if (!apic->apic_id_valid(apic_id)) { + if (enabled) + pr_warn(PREFIX "x2apic entry ignored\n"); + return 0; + } + + acpi_register_lapic(apic_id, processor->uid, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif @@ -1376,17 +1383,21 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) * * We initialize the Hardware-reduced ACPI model here: */ +void __init acpi_generic_reduced_hw_init(void) +{ + /* + * Override x86_init functions and bypass legacy PIC in + * hardware reduced ACPI mode. 
+ */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; +} + static void __init acpi_reduced_hw_init(void) { - if (acpi_gbl_reduced_hardware) { - /* - * Override x86_init functions and bypass legacy pic - * in Hardware-reduced ACPI mode - */ - x86_init.timers.timer_init = x86_init_noop; - x86_init.irqs.pre_vector_init = x86_init_noop; - legacy_pic = &null_legacy_pic; - } + if (acpi_gbl_reduced_hardware) + x86_init.acpi.reduced_hw_early_init(); } /* diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index ecd486cb06ab..f299d8a479bb 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -480,30 +480,21 @@ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, unsigned long attrs) { - dma_addr_t paddr; - unsigned long align_mask; - struct page *page; - - if (force_iommu && !(flag & GFP_DMA)) { - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - align_mask = (1UL << get_order(size)) - 1; - paddr = dma_map_area(dev, page_to_phys(page), size, - DMA_BIDIRECTIONAL, align_mask); - - flush_gart(); - if (paddr != bad_dma_addr) { - *dma_addr = paddr; - return page_address(page); - } - __free_pages(page, get_order(size)); - } else - return dma_generic_alloc_coherent(dev, size, dma_addr, flag, - attrs); + void *vaddr; + + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); + if (!vaddr || + !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) + return vaddr; + *dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size, + DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1); + flush_gart(); + if (unlikely(*dma_addr == bad_dma_addr)) + goto out_free; + return vaddr; +out_free: + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -513,7 +504,7 @@ gart_free_coherent(struct device *dev, size_t 
size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); - dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); } static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) @@ -705,7 +696,7 @@ static const struct dma_map_ops gart_dma_ops = { .alloc = gart_alloc_coherent, .free = gart_free_coherent, .mapping_error = gart_mapping_error, - .dma_supported = x86_dma_supported, + .dma_supported = dma_direct_supported, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6db28f17ff28..c88e0b127810 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -235,7 +235,7 @@ int amd_cache_northbridges(void) if (boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model >= 0x8 && (boot_cpu_data.x86_model > 0x9 || - boot_cpu_data.x86_mask >= 0x1)) + boot_cpu_data.x86_stepping >= 0x1)) amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; if (boot_cpu_data.x86 == 0x15) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 25ddf02598d2..2aabd4cb0e3f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -546,7 +546,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static u32 hsx_deadline_rev(void) { - switch (boot_cpu_data.x86_mask) { + switch (boot_cpu_data.x86_stepping) { case 0x02: return 0x3a; /* EP */ case 0x04: return 0x0f; /* EX */ } @@ -556,7 +556,7 @@ static u32 hsx_deadline_rev(void) static u32 bdx_deadline_rev(void) { - switch (boot_cpu_data.x86_mask) { + switch (boot_cpu_data.x86_stepping) { case 0x02: return 0x00000011; case 0x03: return 0x0700000e; case 0x04: return 0x0f00000c; @@ -568,7 +568,7 @@ static u32 bdx_deadline_rev(void) static u32 skx_deadline_rev(void) { - switch (boot_cpu_data.x86_mask) { + switch (boot_cpu_data.x86_stepping) { case 0x03: return 0x01000136; case 0x04: return 0x02000014; } 
@@ -1408,22 +1408,69 @@ static void lapic_setup_esr(void) oldvalue, value); } +static void apic_pending_intr_clear(void) +{ + long long max_loops = cpu_khz ? cpu_khz : 1000000; + unsigned long long tsc = 0, ntsc; + unsigned int queued; + unsigned long value; + int i, j, acked = 0; + + if (boot_cpu_has(X86_FEATURE_TSC)) + tsc = rdtsc(); + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + do { + queued = 0; + for (i = APIC_ISR_NR - 1; i >= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for_each_set_bit(j, &value, 32) { + ack_APIC_irq(); + acked++; + } + } + if (acked > 256) { + pr_err("LAPIC pending interrupts after %d EOI\n", acked); + break; + } + if (queued) { + if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { + ntsc = rdtsc(); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else { + max_loops--; + } + } + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); +} + /** * setup_local_APIC - setup the local APIC * * Used to setup local APIC while initializing BSP or bringing up APs. * Always called with preemption disabled. */ -void setup_local_APIC(void) +static void setup_local_APIC(void) { int cpu = smp_processor_id(); - unsigned int value, queued; - int i, j, acked = 0; - unsigned long long tsc = 0, ntsc; - long long max_loops = cpu_khz ? 
cpu_khz : 1000000; + unsigned int value; +#ifdef CONFIG_X86_32 + int logical_apicid, ldr_apicid; +#endif - if (boot_cpu_has(X86_FEATURE_TSC)) - tsc = rdtsc(); if (disable_apic) { disable_ioapic_support(); @@ -1460,11 +1507,11 @@ void setup_local_APIC(void) * initialized during get_smp_config(), make sure it matches the * actual value. */ - i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - logical_smp_processor_id(); + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; #endif /* @@ -1475,45 +1522,7 @@ void setup_local_APIC(void) value &= ~APIC_TPRI_MASK; apic_write(APIC_TASKPRI, value); - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. 
- */ - do { - queued = 0; - for (i = APIC_ISR_NR - 1; i >= 0; i--) - queued |= apic_read(APIC_IRR + i*0x10); - - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) { - ack_APIC_irq(); - acked++; - } - } - } - if (acked > 256) { - printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", - acked); - break; - } - if (queued) { - if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { - ntsc = rdtsc(); - max_loops = (cpu_khz << 10) - (ntsc - tsc); - } else - max_loops--; - } - } while (queued && max_loops > 0); - WARN_ON(max_loops <= 0); + apic_pending_intr_clear(); /* * Now that we are all set up, enable the APIC @@ -1570,7 +1579,7 @@ void setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!cpu && (pic_mode || !value)) { + if (!cpu && (pic_mode || !value || skip_ioapic_setup)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); } else { diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index a360801779ae..02b4839478b1 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid) return physid_isset(phys_apicid, phys_cpu_present_map); } -int default_apic_id_valid(int apicid) +int default_apic_id_valid(u32 apicid) { return (apicid < 255); } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 134e04506ab4..78778b54f904 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id) return id << 24; } -static int numachip_apic_id_valid(int apicid) +static int numachip_apic_id_valid(u32 apicid) { /* Trust what bootloader passes in MADT */ return 1; diff --git a/arch/x86/kernel/apic/io_apic.c 
b/arch/x86/kernel/apic/io_apic.c index 8ad2e410974f..7553819c74c3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -587,7 +587,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) mpc_ioapic_id(apic), pin); } -static void clear_IO_APIC (void) +void clear_IO_APIC (void) { int apic, pin; @@ -1410,7 +1410,7 @@ void __init enable_IO_APIC(void) clear_IO_APIC(); } -void native_disable_io_apic(void) +void native_restore_boot_irq_mode(void) { /* * If the i8259 is routed through an IOAPIC @@ -1438,20 +1438,12 @@ void native_disable_io_apic(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) +void restore_boot_irq_mode(void) { - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - if (!nr_legacy_irqs()) return; - x86_io_apic_ops.disable(); + x86_apic_ops.restore(); } #ifdef CONFIG_X86_32 @@ -1603,7 +1595,7 @@ static void __init delay_with_tsc(void) do { rep_nop(); now = rdtsc(); - } while ((now - start) < 40000000000UL / HZ && + } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3cc471beb50b..bb6f7a2148d7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); lockdep_assert_held(&vector_lock); trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, apicd->cpu); - /* Setup the vector move, if required */ - if (apicd->vector && cpu_online(apicd->cpu)) { + /* + * If there is no vector associated or if the associated vector is + * the shutdown vector, which is associated to make PCI/MSI + * shutdown mode work, then there is nothing to release. 
Clear out + * prev_vector for this and the offlined target case. + */ + apicd->prev_vector = 0; + if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; + /* + * If the target CPU of the previous vector is online, then mark + * the vector as move in progress and store it for cleanup when the + * first interrupt on the new vector arrives. If the target CPU is + * offline then the regular release mechanism via the cleanup + * vector is not possible and the vector can be immediately freed + * in the underlying matrix allocator. + */ + if (cpu_online(apicd->cpu)) { apicd->move_in_progress = true; apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; } else { - apicd->prev_vector = 0; + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); } +setnew: apicd->vector = newvec; apicd->cpu = newcpu; BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h index b107de381cb5..a49b3604027f 100644 --- a/arch/x86/kernel/apic/x2apic.h +++ b/arch/x86/kernel/apic/x2apic.h @@ -1,6 +1,6 @@ /* Common bits for X2APIC cluster/physical modes. 
*/ -int x2apic_apic_id_valid(int apicid); +int x2apic_apic_id_valid(u32 apicid); int x2apic_apic_id_registered(void); void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); unsigned int x2apic_get_apic_id(unsigned long id); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f8d9d69994e6..b5cf9e7b3830 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -14,7 +14,7 @@ int x2apic_phys; static struct apic apic_x2apic_phys; -static int set_x2apic_phys_mode(char *arg) +static int __init set_x2apic_phys_mode(char *arg) { x2apic_phys = 1; return 0; @@ -101,7 +101,7 @@ static int x2apic_phys_probe(void) } /* Common x2apic functions, also used by x2apic_cluster */ -int x2apic_apic_id_valid(int apicid) +int x2apic_apic_id_valid(u32 apicid) { return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 46b675aaf20b..efaf2d4f9c3c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -557,7 +557,7 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static int uv_apic_id_valid(int apicid) +static int uv_apic_id_valid(u32 apicid) { return 1; } @@ -1176,16 +1176,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) uv_gre_table = gre; for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + unsigned long size = ((unsigned long)(gre->limit - lgre) + << UV_GAM_RANGE_SHFT); + int order = 0; + char suffix[] = " KMGTPE"; + + while (size > 9999 && order < sizeof(suffix)) { + size /= 1024; + order++; + } + if (!index) { pr_info("UV: GAM Range Table...\n"); pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned 
long)gre->limit << UV_GAM_RANGE_SHFT, - ((unsigned long)(gre->limit - lgre)) >> - (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ + size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fa1261eefa16..f91ba53e06c8 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -18,7 +18,7 @@ void foo(void) OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); - OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 570e8bb1f386..a66229f51b12 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5bddbdcbc4a3..12bc0a1139da 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -119,7 +119,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } - if (c->x86_model == 6 && c->x86_mask == 1) { + if (c->x86_model == 6 && c->x86_stepping == 1) { const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); @@ -149,7 +149,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) /* K6 with old style WHCR */ if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { + (c->x86_model == 8 
&& c->x86_stepping < 8)) { /* We can only write allocate on the low 508Mb */ if (mbytes > 508) mbytes = 508; @@ -168,7 +168,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } - if ((c->x86_model == 8 && c->x86_mask > 7) || + if ((c->x86_model == 8 && c->x86_stepping > 7) || c->x86_model == 9 || c->x86_model == 13) { /* The more serious chips .. */ @@ -221,7 +221,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx * As per AMD technical note 27212 0.2 */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", @@ -241,12 +241,12 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * but they are not certified as MP capable. */ /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) + if ((c->x86_model == 6) && ((c->x86_stepping == 0) || + (c->x86_stepping == 1))) return; /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) + if ((c->x86_model == 7) && (c->x86_stepping == 0)) return; /* @@ -256,8 +256,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for * more. 
*/ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || + if (((c->x86_model == 6) && (c->x86_stepping >= 2)) || + ((c->x86_model == 7) && (c->x86_stepping >= 1)) || (c->x86_model > 7)) if (cpu_has(c, X86_FEATURE_MP)) return; @@ -628,7 +628,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* Set MTRR capability flag if appropriate */ if (c->x86 == 5) if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) + (c->x86_model == 8 && c->x86_stepping >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) @@ -716,7 +716,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) static void init_amd_gh(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 +#ifdef CONFIG_MMCONF_FAM10H /* do this for boot cpu */ if (c == &boot_cpu_data) check_enable_amd_mmconf_dmi(); @@ -795,7 +795,7 @@ static void init_amd_zn(struct cpuinfo_x86 *c) * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects * all up to and including B1. 
*/ - if (c->x86_model <= 1 && c->x86_mask <= 1) + if (c->x86_model <= 1 && c->x86_stepping <= 1) set_cpu_cap(c, X86_FEATURE_CPB); } @@ -906,11 +906,11 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { /* Duron Rev A0 */ - if (c->x86_model == 3 && c->x86_mask == 0) + if (c->x86_model == 3 && c->x86_stepping == 0) size = 64; /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) + (c->x86_stepping == 0 || c->x86_stepping == 1)) size = 256; } return size; @@ -1047,7 +1047,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) } /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_mask; + ms = (cpu->x86_model << 4) | cpu->x86_stepping; while ((range = *erratum++)) if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && (ms >= AMD_MODEL_RANGE_START(range)) && diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 71949bf2de5a..bfca937bdcc3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -162,8 +162,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) return SPECTRE_V2_CMD_NONE; else { - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, - sizeof(arg)); + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) return SPECTRE_V2_CMD_AUTO; @@ -175,8 +174,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", - mitigation_options[i].option); + pr_err("unknown option (%s). 
Switching to AUTO select\n", arg); return SPECTRE_V2_CMD_AUTO; } } @@ -185,8 +183,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -256,14 +253,14 @@ static void __init spectre_v2_select_mitigation(void) goto retpoline_auto; break; } - pr_err("kernel not compiled with retpoline; no mitigation available!"); + pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); return; retpoline_auto: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { retpoline_amd: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); goto retpoline_generic; } mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : @@ -281,7 +278,7 @@ retpoline_auto: pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP or KPTI are available, there is a risk of + * If neither SMEP nor PTI are available, there is a risk of * hitting userspace addresses in the RSB after a context switch * from a shallow call stack to a deeper one. To prevent this fill * the entire RSB, even when using IBRS. 
@@ -295,21 +292,29 @@ retpoline_auto: if ((!boot_cpu_has(X86_FEATURE_PTI) && !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Filling RSB on context switch\n"); + pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); } /* Initialize Indirect Branch Prediction Barrier if supported */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); - pr_info("Enabling Indirect Branch Prediction Barrier\n"); + pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); + } + + /* + * Retpoline means the kernel is safe because it has no indirect + * branches. But firmware isn't, so use IBRS to protect that. + */ + if (boot_cpu_has(X86_FEATURE_IBRS)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); } } #undef pr_fmt #ifdef CONFIG_SYSFS -ssize_t cpu_show_meltdown(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return sprintf(buf, "Not affected\n"); @@ -318,28 +323,21 @@ ssize_t cpu_show_meltdown(struct device *dev, return sprintf(buf, "Vulnerable\n"); } -ssize_t cpu_show_spectre_v1(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) return sprintf(buf, "Not affected\n"); return sprintf(buf, "Mitigation: __user pointer sanitization\n"); } -ssize_t cpu_show_spectre_v2(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + return sprintf(buf, 
"%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", spectre_v2_module_string()); } #endif - -void __ibp_barrier(void) -{ - __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0); -} -EXPORT_SYMBOL_GPL(__ibp_barrier); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c578cd29c2d2..e5ec0f11c0de 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -140,7 +140,7 @@ static void init_centaur(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: - switch (c->x86_mask) { + switch (c->x86_stepping) { default: name = "2"; break; @@ -215,7 +215,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) * - Note, it seems this may only be in engineering samples. */ if ((c->x86 == 6) && (c->x86_model == 9) && - (c->x86_mask == 1) && (size == 65)) + (c->x86_stepping == 1) && (size == 65)) size -= 1; return size; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d63f4b5706e4..8a5b185735e1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,7 +487,7 @@ void load_percpu_segment(int cpu) loadsegment(fs, __KERNEL_PERCPU); #else __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #endif load_stack_canary_segment(); } @@ -731,7 +731,7 @@ void cpu_detect(struct cpuinfo_x86 *c) cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = x86_family(tfms); c->x86_model = x86_model(tfms); - c->x86_mask = x86_stepping(tfms); + c->x86_stepping = x86_stepping(tfms); if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; @@ -848,18 +848,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_power = edx; } - if (c->extended_cpuid_level >= 0x80000008) { - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 
0xff; - c->x86_phys_bits = eax & 0xff; - c->x86_capability[CPUID_8000_0008_EBX] = ebx; - } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif - if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -874,6 +862,23 @@ void get_cpu_cap(struct cpuinfo_x86 *c) apply_forced_caps(c); } +static void get_cpu_address_sizes(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + if (c->extended_cpuid_level >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + c->x86_capability[CPUID_8000_0008_EBX] = ebx; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif +} + static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -965,6 +970,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); + get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); if (this_cpu->c_early_init) @@ -1097,6 +1103,8 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_cap(c); + get_cpu_address_sizes(c); + if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 @@ -1184,9 +1192,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) int i; c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; + c->x86_cache_size = 0; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_model = c->x86_stepping = 0; /* So far unknown... 
*/ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; @@ -1378,8 +1386,8 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); - if (c->x86_mask || c->cpuid_level >= 0) - pr_cont(", stepping: 0x%x)\n", c->x86_mask); + if (c->x86_stepping || c->cpuid_level >= 0) + pr_cont(", stepping: 0x%x)\n", c->x86_stepping); else pr_cont(")\n"); } @@ -1398,6 +1406,7 @@ __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); /* * The following percpu variables are hot. Align current_task to @@ -1749,3 +1758,33 @@ static int __init init_cpu_syscore(void) return 0; } core_initcall(init_cpu_syscore); + +/* + * The microcode loader calls this upon late microcode load to recheck features, + * only when microcode has been updated. Caller holds microcode_mutex and CPU + * hotplug lock. + */ +void microcode_check(void) +{ + struct cpuinfo_x86 info; + + perf_check_microcode(); + + /* Reload CPUID max function as it might've changed. */ + info.cpuid_level = cpuid_eax(0); + + /* + * Copy all capability leafs to pick up the synthetic ones so that + * memcmp() below doesn't fail on that. The ones coming from CPUID will + * get overwritten in get_cpu_cap(). 
+ */ + memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)); + + get_cpu_cap(&info); + + if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability))) + return; + + pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); + pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); +} diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 904b0a3c4e53..2c0bd38a44ab 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -19,7 +19,7 @@ struct cpuid_dep { * called from cpu hotplug. It shouldn't do anything in this case, * but it's difficult to tell that to the init reference checker. */ -const static struct cpuid_dep cpuid_deps[] = { +static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6b4bb335641f..8949b7ae6d92 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -215,7 +215,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) /* common case step number/rev -- exceptions handled below */ c->x86_model = (dir1 >> 4) + 1; - c->x86_mask = dir1 & 0xf; + c->x86_stepping = dir1 & 0xf; /* Now cook; the original recipe is by Channing Corn, from Cyrix. * We do the same thing for each generation: we work out diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 319bf989fad1..60d1897041da 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -105,7 +105,7 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) /* * Early microcode releases for the Spectre v2 mitigation were broken. 
* Information taken from; - * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf * - https://kb.vmware.com/s/article/52345 * - Microcode revisions observed in the wild * - Release note from 20180108 microcode release @@ -116,15 +116,13 @@ struct sku_microcode { u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 }, - { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, @@ -136,8 +134,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = { { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, - /* Updated in the 20180108 release; blacklist until we know otherwise */ - { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 }, /* Observed in the wild */ { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, @@ -147,9 +143,16 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) { int i; + /* + * We know that the hypervisor lie to us on the microcode version so + * we may as well hope that it is running the correct version. 
+ */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return false; + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { if (c->x86_model == spectre_bad_microcodes[i].model && - c->x86_mask == spectre_bad_microcodes[i].stepping) + c->x86_stepping == spectre_bad_microcodes[i].stepping) return (c->microcode <= spectre_bad_microcodes[i].microcode); } return false; @@ -196,7 +199,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && c->microcode < 0x20e) { pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); @@ -212,7 +215,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* CPUID workaround for 0F33/0F34 CPU */ if (c->x86 == 0xF && c->x86_model == 0x3 - && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) + && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) c->x86_phys_bits = 36; /* @@ -310,7 +313,7 @@ int ppro_with_ram_bug(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask < 8) { + boot_cpu_data.x86_stepping < 8) { pr_info("Pentium Pro with Errata#50 detected. 
Taking evasive action.\n"); return 1; } @@ -327,7 +330,7 @@ static void intel_smp_check(struct cpuinfo_x86 *c) * Mask B, Pentium, but not Pentium MMX */ if (c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_stepping >= 1 && c->x86_stepping <= 4 && c->x86_model <= 3) { /* * Remember we have B step Pentia with bugs @@ -370,7 +373,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until * model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) clear_cpu_cap(c, X86_FEATURE_SEP); /* @@ -388,7 +391,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); @@ -403,7 +406,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Specification Update"). 
*/ if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && - (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) set_cpu_bug(c, X86_BUG_11AP); @@ -506,6 +509,90 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. 
*/ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. 
+ */ + c->x86_phys_bits -= keyid_bits; +} + static void init_intel_energy_perf(struct cpuinfo_x86 *c) { u64 epb; @@ -650,7 +737,7 @@ static void init_intel(struct cpuinfo_x86 *c) case 6: if (l2 == 128) p = "Celeron (Mendocino)"; - else if (c->x86_mask == 0 || c->x86_mask == 5) + else if (c->x86_stepping == 0 || c->x86_stepping == 5) p = "Celeron-A"; break; @@ -676,6 +763,9 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme(c); + init_intel_energy_perf(c); init_intel_misc_features(c); @@ -745,6 +835,9 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, + { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, + { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c new file mode 100644 index 000000000000..0771a905b286 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel PCONFIG instruction support. + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> + */ + +#include <asm/cpufeature.h> +#include <asm/intel_pconfig.h> + +#define PCONFIG_CPUID 0x1b + +#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) + +/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ +enum { + PCONFIG_CPUID_SUBLEAF_INVALID = 0, + PCONFIG_CPUID_SUBLEAF_TARGETID = 1, +}; + +/* Bitmask of supported targets */ +static u64 targets_supported __read_mostly; + +int pconfig_target_supported(enum pconfig_target target) +{ + /* + * We would need to re-think the implementation once we get > 64 + * PCONFIG targets. Spec allows up to 2^32 targets. + */ + BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); + + if (WARN_ON_ONCE(target >= 64)) + return 0; + return targets_supported & (1ULL << target); +} + +static int __init intel_pconfig_init(void) +{ + int subleaf; + + if (!boot_cpu_has(X86_FEATURE_PCONFIG)) + return 0; + + /* + * Scan subleafs of PCONFIG CPUID leaf. + * + * Subleafs of the same type need not to be consecutive. + * + * Stop on the first invalid subleaf type. All subleafs after the first + * invalid are invalid too. 
+ */ + for (subleaf = 0; subleaf < INT_MAX; subleaf++) { + struct cpuid_regs regs; + + cpuid_count(PCONFIG_CPUID, subleaf, + &regs.eax, &regs.ebx, &regs.ecx, &regs.edx); + + switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { + case PCONFIG_CPUID_SUBLEAF_INVALID: + /* Stop on the first invalid subleaf */ + goto out; + case PCONFIG_CPUID_SUBLEAF_TARGETID: + /* Mark supported PCONFIG targets */ + if (regs.ebx < 64) + targets_supported |= (1ULL << regs.ebx); + if (regs.ecx < 64) + targets_supported |= (1ULL << regs.ecx); + if (regs.edx < 64) + targets_supported |= (1ULL << regs.edx); + break; + default: + /* Unknown CPUID.PCONFIG subleaf: ignore */ + break; + } + } +out: + return 0; +} +arch_initcall(intel_pconfig_init); diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 410629f10ad3..589b948e6e01 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -819,7 +819,7 @@ static __init void rdt_quirks(void) cache_alloc_hsw_probe(); break; case INTEL_FAM6_SKYLAKE_X: - if (boot_cpu_data.x86_mask <= 4) + if (boot_cpu_data.x86_stepping <= 4) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); } } diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index bdab7d2f51af..fca759d272a1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1804,6 +1804,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, goto out_common_fail; } closid = ret; + ret = 0; rdtgrp->closid = closid; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 231ad23b24a9..475cb4f5f14f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -491,7 +491,7 @@ static void do_inject(void) unsigned int cpu = i_mce.extcpu; u8 b = i_mce.bank; - rdtscll(i_mce.tsc); + i_mce.tsc = rdtsc_ordered(); if (i_mce.misc) 
i_mce.status |= MCI_STATUS_MISCV; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index aa0d5df9dc60..374d1aa66952 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -113,6 +113,76 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif +#ifndef CONFIG_X86_64 +/* + * On 32-bit systems it would be difficult to safely unmap a poison page + * from the kernel 1:1 map because there are no non-canonical addresses that + * we can use to refer to the address without risking a speculative access. + * However, this isn't much of an issue because: + * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which + * are only mapped into the kernel as needed + * 2) Few people would run a 32-bit kernel on a machine that supports + * recoverable errors because they have too much memory to boot 32-bit. + */ +static inline void mce_unmap_kpfn(unsigned long pfn) {} +#define mce_unmap_kpfn mce_unmap_kpfn +#endif + +struct mca_config { + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + + __u64 lmce_disabled : 1, + disabled : 1, + ser : 1, + recovery : 1, + bios_cmci_threshold : 1, + __reserved : 59; + + u8 banks; + s8 bootlog; + int tolerant; + int monarch_timeout; + int panic_timeout; + u32 rip_msr; +}; + extern struct mca_config mca_cfg; +struct mce_vendor_flags { + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. 
Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; +}; + +extern struct mce_vendor_flags mce_flags; + +struct mca_msr_regs { + u32 (*ctl) (int bank); + u32 (*status) (int bank); + u32 (*addr) (int bank); + u32 (*misc) (int bank); +}; + +extern struct mca_msr_regs msr_ops; + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3a8e88a611eb..42cf2880d0ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -56,6 +56,9 @@ static DEFINE_MUTEX(mce_log_mutex); +/* sysfs synchronization */ +static DEFINE_MUTEX(mce_sysfs_mutex); + #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -105,6 +108,10 @@ static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); +#ifndef mce_unmap_kpfn +static void mce_unmap_kpfn(unsigned long pfn); +#endif + /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. 
@@ -126,6 +133,8 @@ void mce_setup(struct mce *m) if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) rdmsrl(MSR_PPIN, m->ppin); + + m->microcode = boot_cpu_data.microcode; } DEFINE_PER_CPU(struct mce, injectm); @@ -234,7 +243,7 @@ static void __print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) - pr_cont("{%pS}", (void *)m->ip); + pr_cont("{%pS}", (void *)(unsigned long)m->ip); pr_cont("\n"); } @@ -258,13 +267,15 @@ static void __print_mce(struct mce *m) */ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, - cpu_data(m->extcpu).microcode); + m->microcode); } static void print_mce(struct mce *m) { __print_mce(m); - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); + + if (m->cpuvendor != X86_VENDOR_AMD) + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -590,7 +601,8 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; - memory_failure(pfn, 0); + if (!memory_failure(pfn, 0)) + mce_unmap_kpfn(pfn); } return NOTIFY_OK; @@ -1057,12 +1069,13 @@ static int do_memory_failure(struct mce *m) ret = memory_failure(m->addr >> PAGE_SHIFT, flags); if (ret) pr_err("Memory error not recovered"); + else + mce_unmap_kpfn(m->addr >> PAGE_SHIFT); return ret; } -#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) - -void arch_unmap_kpfn(unsigned long pfn) +#ifndef mce_unmap_kpfn +static void mce_unmap_kpfn(unsigned long pfn) { unsigned long decoy_addr; @@ -1073,7 +1086,7 @@ void arch_unmap_kpfn(unsigned long pfn) * We would like to just call: * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a - * speculative access to the posion page because we'd have + * speculative access to the poison page because we'd have * 
the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address @@ -1082,23 +1095,10 @@ void arch_unmap_kpfn(unsigned long pfn) * a legal address. */ -/* - * Build time check to see if we have a spare virtual bit. Don't want - * to leave this until run time because most developers don't have a - * system that can exercise this code path. This will only become a - * problem if/when we move beyond 5-level page tables. - * - * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) - */ -#if PGDIR_SHIFT + 9 < 63 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); -#else -#error "no unused virtual bit available" -#endif if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); - } #endif @@ -1506,7 +1506,7 @@ static int __mcheck_cpu_cap_init(void) mca_cfg.rip_msr = MSR_IA32_MCG_EIP; if (cap & MCG_SER_P) - mca_cfg.ser = true; + mca_cfg.ser = 1; return 0; } @@ -1814,12 +1814,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mca_cfg.disabled = true; + mca_cfg.disabled = 1; return; } if (mce_gen_pool_init()) { - mca_cfg.disabled = true; + mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; } @@ -1897,11 +1897,11 @@ static int __init mcheck_enable(char *str) if (*str == '=') str++; if (!strcmp(str, "off")) - cfg->disabled = true; + cfg->disabled = 1; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; else if (!strcmp(str, "no_lmce")) - cfg->lmce_disabled = true; + cfg->lmce_disabled = 1; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) @@ -1909,9 +1909,9 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) - cfg->bios_cmci_threshold = true; + 
cfg->bios_cmci_threshold = 1; else if (!strcmp(str, "recovery")) - cfg->recovery = true; + cfg->recovery = 1; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); @@ -2081,6 +2081,7 @@ static ssize_t set_ignore_ce(struct device *s, if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; + mutex_lock(&mce_sysfs_mutex); if (mca_cfg.ignore_ce ^ !!new) { if (new) { /* disable ce features */ @@ -2093,6 +2094,8 @@ static ssize_t set_ignore_ce(struct device *s, on_each_cpu(mce_enable_ce, (void *)1, 1); } } + mutex_unlock(&mce_sysfs_mutex); + return size; } @@ -2105,6 +2108,7 @@ static ssize_t set_cmci_disabled(struct device *s, if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; + mutex_lock(&mce_sysfs_mutex); if (mca_cfg.cmci_disabled ^ !!new) { if (new) { /* disable cmci */ @@ -2116,6 +2120,8 @@ static ssize_t set_cmci_disabled(struct device *s, on_each_cpu(mce_enable_ce, NULL, 1); } } + mutex_unlock(&mce_sysfs_mutex); + return size; } @@ -2123,8 +2129,19 @@ static ssize_t store_int_with_restart(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = device_store_int(s, attr, buf, size); + unsigned long old_check_interval = check_interval; + ssize_t ret = device_store_ulong(s, attr, buf, size); + + if (check_interval == old_check_interval) + return ret; + + if (check_interval < 1) + check_interval = 1; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); + return ret; } @@ -2328,6 +2345,12 @@ static __init int mcheck_init_device(void) { int err; + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. 
+ */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + if (!mce_available(&boot_cpu_data)) { err = -EIO; goto err_out; @@ -2376,7 +2399,7 @@ device_initcall_sync(mcheck_init_device); */ static int __init mcheck_disable(char *str) { - mca_cfg.disabled = true; + mca_cfg.disabled = 1; return 1; } __setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 0f32ad242324..f7666eef4a87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -82,6 +82,7 @@ static struct smca_bank_name smca_names[] = { [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, [SMCA_EX] = { "execution_unit", "Execution Unit" }, [SMCA_FP] = { "floating_point", "Floating Point Unit" }, [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, @@ -110,14 +111,14 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); -static enum smca_bank_types smca_get_bank_type(struct mce *m) +static enum smca_bank_types smca_get_bank_type(unsigned int bank) { struct smca_bank *b; - if (m->bank >= N_SMCA_BANK_TYPES) + if (bank >= MAX_NR_BANKS) return N_SMCA_BANK_TYPES; - b = &smca_banks[m->bank]; + b = &smca_banks[bank]; if (!b->hwid) return N_SMCA_BANK_TYPES; @@ -127,6 +128,9 @@ static enum smca_bank_types smca_get_bank_type(struct mce *m) static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ + /* Reserved type */ + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + /* ZN Core (HWID=0xB0) MCA types */ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, @@ -427,35 +431,58 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, + 
unsigned int block) +{ + u32 low, high; + u32 addr = 0; + + if (smca_get_bank_type(bank) == SMCA_RESERVED) + return addr; + + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + + return addr; +} + static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; - if (mce_flags.smca) { - if (!block) { - addr = MSR_AMD64_SMCA_MCx_MISC(bank); - } else { - /* - * For SMCA enabled processors, BLKPTR field of the - * first MISC register (MCx_MISC0) indicates presence of - * additional MISC register set (MISC1-4). - */ - u32 low, high; + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + return addr; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return addr; + /* Get address from already initialized block. 
*/ + if (per_cpu(threshold_banks, cpu)) { + struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank]; - if (!(low & MCI_CONFIG_MCAX)) - return addr; + if (bankp && bankp->blocks) { + struct threshold_block *blockp = &bankp->blocks[block]; - if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && - (low & MASK_BLKPTR_LO)) - addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + if (blockp) + return blockp->address; } - return addr; } + if (mce_flags.smca) + return smca_get_block_address(cpu, bank, block); + /* Fall back to method we used for older processors: */ switch (block) { case 0: @@ -760,7 +787,7 @@ bool amd_mce_is_memory_error(struct mce *m) u8 xec = (m->status >> 16) & 0x1f; if (mce_flags.smca) - return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0; + return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1063,7 +1090,7 @@ static struct kobj_type threshold_ktype = { static const char *get_name(unsigned int bank, struct threshold_block *b) { - unsigned int bank_type; + enum smca_bank_types bank_type; if (!mce_flags.smca) { if (b && bank == 4) @@ -1072,11 +1099,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - if (!smca_banks[bank].hwid) + bank_type = smca_get_bank_type(bank); + if (bank_type >= N_SMCA_BANK_TYPES) return NULL; - bank_type = smca_banks[bank].hwid->bank_type; - if (b && bank_type == SMCA_UMC) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 330b8462d426..0624957aa068 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -58,7 +58,7 @@ static u8 amd_ucode_patch[PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/early-microcode.txt + * format. 
See Documentation/x86/microcode.txt */ static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; @@ -339,7 +339,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) return -EINVAL; ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); - if (ret != UCODE_OK) + if (ret > UCODE_UPDATED) return -EINVAL; return 0; @@ -498,7 +498,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } -static int apply_microcode_amd(int cpu) +static enum ucode_state apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; @@ -512,7 +512,7 @@ static int apply_microcode_amd(int cpu) p = find_patch(cpu); if (!p) - return 0; + return UCODE_NFOUND; mc_amd = p->data; uci->mc = p->data; @@ -523,13 +523,13 @@ static int apply_microcode_amd(int cpu) if (rev >= mc_amd->hdr.patch_id) { c->microcode = rev; uci->cpu_sig.rev = rev; - return 0; + return UCODE_OK; } if (__apply_microcode_amd(mc_amd)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); - return -1; + return UCODE_ERROR; } pr_info("CPU%d: new patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); @@ -537,7 +537,7 @@ static int apply_microcode_amd(int cpu) uci->cpu_sig.rev = mc_amd->hdr.patch_id; c->microcode = mc_amd->hdr.patch_id; - return 0; + return UCODE_UPDATED; } static int install_equiv_cpu_table(const u8 *buf) @@ -683,27 +683,35 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, static enum ucode_state load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) { + struct ucode_patch *p; enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); ret = __load_microcode_amd(family, data, size); - - if (ret != UCODE_OK) + if (ret != UCODE_OK) { cleanup(); + return ret; + } -#ifdef CONFIG_X86_32 - /* save BSP's matching patch for early load */ - if (save) { - struct ucode_patch *p = find_patch(0); - if (p) { 
- memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), - PATCH_MAX_SIZE)); - } + p = find_patch(0); + if (!p) { + return ret; + } else { + if (boot_cpu_data.microcode == p->patch_id) + return ret; + + ret = UCODE_NEW; } -#endif + + /* save BSP's matching patch for early load */ + if (!save) + return ret; + + memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); + memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + return ret; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 319dd65f98a2..77e201301528 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -22,13 +22,16 @@ #define pr_fmt(fmt) "microcode: " fmt #include <linux/platform_device.h> +#include <linux/stop_machine.h> #include <linux/syscore_ops.h> #include <linux/miscdevice.h> #include <linux/capability.h> #include <linux/firmware.h> #include <linux/kernel.h> +#include <linux/delay.h> #include <linux/mutex.h> #include <linux/cpu.h> +#include <linux/nmi.h> #include <linux/fs.h> #include <linux/mm.h> @@ -64,6 +67,11 @@ LIST_HEAD(microcode_cache); */ static DEFINE_MUTEX(microcode_mutex); +/* + * Serialize late loading so that CPUs get updated one-by-one. 
+ */ +static DEFINE_SPINLOCK(update_lock); + struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -373,26 +381,23 @@ static int collect_cpu_info(int cpu) return ret; } -struct apply_microcode_ctx { - int err; -}; - static void apply_microcode_local(void *arg) { - struct apply_microcode_ctx *ctx = arg; + enum ucode_state *err = arg; - ctx->err = microcode_ops->apply_microcode(smp_processor_id()); + *err = microcode_ops->apply_microcode(smp_processor_id()); } static int apply_microcode_on_target(int cpu) { - struct apply_microcode_ctx ctx = { .err = 0 }; + enum ucode_state err; int ret; - ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); - if (!ret) - ret = ctx.err; - + ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1); + if (!ret) { + if (err == UCODE_ERROR) + ret = 1; + } return ret; } @@ -489,31 +494,122 @@ static void __exit microcode_dev_exit(void) /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static int reload_for_cpu(int cpu) +/* + * Late loading dance. Why the heavy-handed stomp_machine effort? + * + * - HT siblings must be idle and not execute other code while the other sibling + * is loading microcode in order to avoid any negative interactions caused by + * the loading. + * + * - In addition, microcode update on the cores must be serialized until this + * requirement can be relaxed in the future. Right now, this is conservative + * and good. 
+ */ +#define SPINUNIT 100 /* 100 nsec */ + +static int check_online_cpus(void) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; - int err = 0; + if (num_online_cpus() == num_present_cpus()) + return 0; - if (!uci->valid) - return err; + pr_err("Not all CPUs online, aborting microcode update.\n"); - ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true); - if (ustate == UCODE_OK) - apply_microcode_on_target(cpu); - else - if (ustate == UCODE_ERROR) - err = -EINVAL; - return err; + return -EINVAL; +} + +static atomic_t late_cpus_in; +static atomic_t late_cpus_out; + +static int __wait_for_cpus(atomic_t *t, long long timeout) +{ + int all_cpus = num_online_cpus(); + + atomic_inc(t); + + while (atomic_read(t) < all_cpus) { + if (timeout < SPINUNIT) { + pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", + all_cpus - atomic_read(t)); + return 1; + } + + ndelay(SPINUNIT); + timeout -= SPINUNIT; + + touch_nmi_watchdog(); + } + return 0; +} + +/* + * Returns: + * < 0 - on error + * 0 - no update done + * 1 - microcode was updated + */ +static int __reload_late(void *info) +{ + int cpu = smp_processor_id(); + enum ucode_state err; + int ret = 0; + + /* + * Wait for all CPUs to arrive. A load will not be attempted unless all + * CPUs show up. + * */ + if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) + return -1; + + spin_lock(&update_lock); + apply_microcode_local(&err); + spin_unlock(&update_lock); + + /* siblings return UCODE_OK because their engine got updated already */ + if (err > UCODE_NFOUND) { + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; + } else if (err == UCODE_UPDATED || err == UCODE_OK) { + ret = 1; + } + + /* + * Increase the wait timeout to a safe value here since we're + * serializing the microcode update and that could take a while on a + * large number of CPUs. 
And that is fine as the *actual* timeout will + * be determined by the last CPU finished updating and thus cut short. + */ + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) + panic("Timeout during microcode update!\n"); + + return ret; +} + +/* + * Reload microcode late on all CPUs. Wait for a sec until they + * all gather together. + */ +static int microcode_reload_late(void) +{ + int ret; + + atomic_set(&late_cpus_in, 0); + atomic_set(&late_cpus_out, 0); + + ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); + if (ret > 0) + microcode_check(); + + return ret; } static ssize_t reload_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { + enum ucode_state tmp_ret = UCODE_OK; + int bsp = boot_cpu_data.cpu_index; unsigned long val; - int cpu; - ssize_t ret = 0, tmp_ret; + ssize_t ret = 0; ret = kstrtoul(buf, 0, &val); if (ret) @@ -522,23 +618,24 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; + tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + return size; + get_online_cpus(); - mutex_lock(&microcode_mutex); - for_each_online_cpu(cpu) { - tmp_ret = reload_for_cpu(cpu); - if (tmp_ret != 0) - pr_warn("Error reloading microcode on CPU %d\n", cpu); - /* save retval of the first encountered reload error */ - if (!ret) - ret = tmp_ret; - } - if (!ret) - perf_check_microcode(); + ret = check_online_cpus(); + if (ret) + goto put; + + mutex_lock(&microcode_mutex); + ret = microcode_reload_late(); mutex_unlock(&microcode_mutex); + +put: put_online_cpus(); - if (!ret) + if (ret >= 0) ret = size; return ret; @@ -606,10 +703,8 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) if (system_state != SYSTEM_RUNNING) return UCODE_NFOUND; - ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, - refresh_fw); - - if (ustate == UCODE_OK) { + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, 
refresh_fw); + if (ustate == UCODE_NEW) { pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f7c55b0e753a..1c2cfa0644aa 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -485,7 +485,6 @@ static void show_saved_mc(void) */ static void save_mc_for_early(u8 *mc, unsigned int size) { -#ifdef CONFIG_HOTPLUG_CPU /* Synchronization during CPU hotplug. */ static DEFINE_MUTEX(x86_cpu_microcode_mutex); @@ -495,7 +494,6 @@ static void save_mc_for_early(u8 *mc, unsigned int size) show_saved_mc(); mutex_unlock(&x86_cpu_microcode_mutex); -#endif } static bool load_builtin_intel_microcode(struct cpio_data *cp) @@ -589,6 +587,23 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (!mc) return 0; + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + uci->cpu_sig.rev = rev; + return UCODE_OK; + } + + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. 
+ */ + native_wbinvd(); + /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -772,27 +787,44 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -static int apply_microcode_intel(int cpu) +static enum ucode_state apply_microcode_intel(int cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_intel *mc; - struct ucode_cpu_info *uci; - struct cpuinfo_x86 *c; static int prev_rev; u32 rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) - return -1; + return UCODE_ERROR; - uci = ucode_cpu_info + cpu; - mc = uci->mc; + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); if (!mc) { - /* Look for a newer patch in our cache: */ - mc = find_patch(uci); + mc = uci->mc; if (!mc) - return 0; + return UCODE_NFOUND; } + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + uci->cpu_sig.rev = rev; + c->microcode = rev; + return UCODE_OK; + } + + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. 
+ */ + native_wbinvd(); + /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -801,7 +833,7 @@ static int apply_microcode_intel(int cpu) if (rev != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", cpu, mc->hdr.rev); - return -1; + return UCODE_ERROR; } if (rev != prev_rev) { @@ -813,12 +845,10 @@ static int apply_microcode_intel(int cpu) prev_rev = rev; } - c = &cpu_data(cpu); - uci->cpu_sig.rev = rev; c->microcode = rev; - return 0; + return UCODE_UPDATED; } static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, @@ -830,6 +860,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; unsigned int csig, cpf; + enum ucode_state ret = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -871,6 +902,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, new_mc = mc; new_mc_size = mc_size; mc = NULL; /* trigger new vmalloc */ + ret = UCODE_NEW; } ucode_ptr += mc_size; @@ -900,7 +932,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - return UCODE_OK; + return ret; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -921,7 +953,7 @@ static bool is_blacklisted(unsigned int cpu) */ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X && - c->x86_mask == 0x01 && + c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); @@ -944,7 +976,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; sprintf(name, "intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); + c->x86, 
c->x86_model, c->x86_stepping); if (request_firmware_direct(&firmware, name, device)) { pr_debug("data file %s load failed\n", name); @@ -982,7 +1014,7 @@ static struct microcode_ops microcode_intel_ops = { static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) { - u64 llc_size = c->x86_cache_size * 1024; + u64 llc_size = c->x86_cache_size * 1024ULL; do_div(llc_size, c->x86_max_cores); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 9340f41ce8d3..031082c96db8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -22,7 +22,7 @@ #include <linux/kexec.h> #include <asm/processor.h> #include <asm/hypervisor.h> -#include <asm/hyperv.h> +#include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/desc.h> #include <asm/irq_regs.h> @@ -37,6 +37,7 @@ EXPORT_SYMBOL_GPL(ms_hyperv); #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); +static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); @@ -69,6 +70,41 @@ void hv_remove_vmbus_irq(void) EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +/* + * Routines to do per-architecture handling of stimer0 + * interrupts when in Direct Mode + */ + +__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_irq(); + inc_irq_stat(hyperv_stimer0_count); + if (hv_stimer0_handler) + hv_stimer0_handler(); + ack_APIC_irq(); + + exiting_irq(); + set_irq_regs(old_regs); +} + +int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) +{ + *vector = HYPERV_STIMER0_VECTOR; + *irq = 0; /* Unused on x86/x64 */ + hv_stimer0_handler = handler; + return 0; +} +EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq); + +void hv_remove_stimer0_irq(int irq) +{ + /* We have no way to deallocate the interrupt gate */ + hv_stimer0_handler = NULL; +} 
+EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq); + void hv_setup_kexec_handler(void (*handler)(void)) { hv_kexec_handler = handler; @@ -180,8 +216,8 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); - ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); - ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); @@ -189,11 +225,12 @@ static void __init ms_hyperv_init_platform(void) /* * Extract host information. */ - if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) { - hv_host_info_eax = cpuid_eax(HVCPUID_VERSION); - hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION); - hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION); - hv_host_info_edx = cpuid_edx(HVCPUID_VERSION); + if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= + HYPERV_CPUID_VERSION) { + hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); + hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); + hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); + hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", hv_host_info_eax, hv_host_info_ebx >> 16, @@ -207,6 +244,11 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { + ms_hyperv.nested_features = + cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); + } + #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { @@ -257,6 +299,10 @@ static void __init ms_hyperv_init_platform(void) alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_vector); + 
/* Setup the IDT for stimer0 */ + if (ms_hyperv.misc_features & HV_X64_STIMER_DIRECT_MODE_AVAILABLE) + alloc_intr_gate(HYPERV_STIMER0_VECTOR, + hv_stimer0_callback_vector); #endif } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fdc55215d44d..e12ee86906c6 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -859,7 +859,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask <= 7) { + boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 40d5a8a75212..7468de429087 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -711,8 +711,8 @@ void __init mtrr_bp_init(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_mask == 0x3 || - boot_cpu_data.x86_mask == 0x4)) + (boot_cpu_data.x86_stepping == 0x3 || + boot_cpu_data.x86_stepping == 0x4)) phys_addr = 36; size_or_mask = SIZE_OR_MASK_BITS(phys_addr); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index e7ecedafa1c8..2c8522a39ed5 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -72,8 +72,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) c->x86_model, c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); + if (c->x86_stepping || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_stepping); else seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) @@ -91,8 +91,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) } /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); + if (c->x86_cache_size) + seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 0931a105ffe1..1d300f96df4b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include <linux/notifier.h> #include <linux/uaccess.h> #include <linux/gfp.h> +#include <linux/completion.h> #include <asm/processor.h> #include <asm/msr.h> @@ -47,19 +48,27 @@ static struct class *cpuid_class; static enum cpuhp_state cpuhp_cpuid_state; +struct cpuid_regs_done { + struct cpuid_regs regs; + struct completion done; +}; + static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block; + struct cpuid_regs_done *cmd = cmd_block; + + cpuid_count(cmd->regs.eax, cmd->regs.ecx, + &cmd->regs.eax, &cmd->regs.ebx, + &cmd->regs.ecx, &cmd->regs.edx); - cpuid_count(cmd->eax, cmd->ecx, - &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); + complete(&cmd->done); } static ssize_t cpuid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char __user *tmp = buf; - struct cpuid_regs cmd; + struct cpuid_regs_done cmd; int cpu = iminor(file_inode(file)); u64 pos = *ppos; ssize_t bytes = 0; @@ -68,19 +77,28 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, if (count % 16) return -EINVAL; /* Invalid chunk size */ + init_completion(&cmd.done); for (; count; count -= 16) { - cmd.eax = pos; - cmd.ecx = pos >> 32; - err = 
smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + call_single_data_t csd = { + .func = cpuid_smp_cpuid, + .info = &cmd, + }; + + cmd.regs.eax = pos; + cmd.regs.ecx = pos >> 32; + + err = smp_call_function_single_async(cpu, &csd); if (err) break; - if (copy_to_user(tmp, &cmd, 16)) { + wait_for_completion(&cmd.done); + if (copy_to_user(tmp, &cmd.regs, 16)) { err = -EFAULT; break; } tmp += 16; bytes += 16; *ppos = ++pos; + reinit_completion(&cmd.done); } return bytes ? bytes : err; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 10e74d4778a1..f631a3f15587 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,37 +38,6 @@ #include <asm/virtext.h> #include <asm/intel_pt.h> -/* Alignment required for elf header segment */ -#define ELF_CORE_HEADER_ALIGN 4096 - -/* This primarily represents number of split ranges due to exclusion */ -#define CRASH_MAX_RANGES 16 - -struct crash_mem_range { - u64 start, end; -}; - -struct crash_mem { - unsigned int nr_ranges; - struct crash_mem_range ranges[CRASH_MAX_RANGES]; -}; - -/* Misc data about ram ranges needed to prepare elf headers */ -struct crash_elf_data { - struct kimage *image; - /* - * Total number of ram ranges we have after various adjustments for - * crash reserved region, etc. - */ - unsigned int max_nr_ranges; - - /* Pointer to elf header */ - void *ehdr; - /* Pointer to next phdr */ - void *bufp; - struct crash_mem mem; -}; - /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { struct boot_params *params; @@ -199,9 +168,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_X86_IO_APIC /* Prevent crash_kexec() from deadlocking on ioapic_lock. 
*/ ioapic_zap_locks(); - disable_IO_APIC(); + clear_IO_APIC(); #endif lapic_shutdown(); + restore_boot_irq_mode(); #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif @@ -217,124 +187,49 @@ static int get_nr_ram_ranges_callback(struct resource *res, void *arg) return 0; } - /* Gather all the required information to prepare elf headers for ram regions */ -static void fill_up_crash_elf_data(struct crash_elf_data *ced, - struct kimage *image) +static struct crash_mem *fill_up_crash_elf_data(void) { unsigned int nr_ranges = 0; - - ced->image = image; + struct crash_mem *cmem; walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + if (!nr_ranges) + return NULL; - ced->max_nr_ranges = nr_ranges; - - /* Exclusion of crash region could split memory ranges */ - ced->max_nr_ranges++; - - /* If crashk_low_res is not 0, another range split possible */ - if (crashk_low_res.end) - ced->max_nr_ranges++; -} - -static int exclude_mem_range(struct crash_mem *mem, - unsigned long long mstart, unsigned long long mend) -{ - int i, j; - unsigned long long start, end; - struct crash_mem_range temp_range = {0, 0}; - - for (i = 0; i < mem->nr_ranges; i++) { - start = mem->ranges[i].start; - end = mem->ranges[i].end; - - if (mstart > end || mend < start) - continue; - - /* Truncate any area outside of range */ - if (mstart < start) - mstart = start; - if (mend > end) - mend = end; - - /* Found completely overlapping range */ - if (mstart == start && mend == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - } - mem->nr_ranges--; - return 0; - } - - if (mstart > start && mend < end) { - /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; - temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = 
mstart - 1; - else - mem->ranges[i].start = mend + 1; - break; - } + /* + * Exclusion of crash region and/or crashk_low_res may cause + * another range split. So add extra two slots here. + */ + nr_ranges += 2; + cmem = vzalloc(sizeof(struct crash_mem) + + sizeof(struct crash_mem_range) * nr_ranges); + if (!cmem) + return NULL; - /* If a split happend, add the split to array */ - if (!temp_range.end) - return 0; + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; - /* Split happened */ - if (i == CRASH_MAX_RANGES - 1) { - pr_err("Too many crash ranges after split\n"); - return -ENOMEM; - } - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; - } - - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; - return 0; + return cmem; } /* * Look for any unwanted ranges between mstart, mend and remove them. 
This - * might lead to split and split ranges are put in ced->mem.ranges[] array + * might lead to split and split ranges are put in cmem->ranges[] array */ -static int elf_header_exclude_ranges(struct crash_elf_data *ced, - unsigned long long mstart, unsigned long long mend) +static int elf_header_exclude_ranges(struct crash_mem *cmem) { - struct crash_mem *cmem = &ced->mem; int ret = 0; - memset(cmem->ranges, 0, sizeof(cmem->ranges)); - - cmem->ranges[0].start = mstart; - cmem->ranges[0].end = mend; - cmem->nr_ranges = 1; - /* Exclude crashkernel region */ - ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; if (crashk_low_res.end) { - ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, + crashk_low_res.end); if (ret) return ret; } @@ -344,144 +239,12 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { - struct crash_elf_data *ced = arg; - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - unsigned long mstart, mend; - struct kimage *image = ced->image; - struct crash_mem *cmem; - int ret, i; - - ehdr = ced->ehdr; - - /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, res->start, res->end); - if (ret) - return ret; - - /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ - cmem = &ced->mem; - - for (i = 0; i < cmem->nr_ranges; i++) { - mstart = cmem->ranges[i].start; - mend = cmem->ranges[i].end; - - phdr = ced->bufp; - ced->bufp += sizeof(Elf64_Phdr); - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = mstart; - - /* - * If a range matches backup region, adjust offset to backup - * segment. 
- */ - if (mstart == image->arch.backup_src_start && - (mend - mstart + 1) == image->arch.backup_src_sz) - phdr->p_offset = image->arch.backup_load_addr; - - phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); - phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; - phdr->p_align = 0; - ehdr->e_phnum++; - pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); - } - - return ret; -} - -static int prepare_elf64_headers(struct crash_elf_data *ced, - void **addr, unsigned long *sz) -{ - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; - unsigned char *buf, *bufp; - unsigned int cpu; - unsigned long long notes_addr; - int ret; + struct crash_mem *cmem = arg; - /* extra phdr for vmcoreinfo elf note */ - nr_phdr = nr_cpus + 1; - nr_phdr += ced->max_nr_ranges; - - /* - * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping - * area on x86_64 (ffffffff80000000 - ffffffffa0000000). - * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel - * text virtual addresses and other will have __va(physical) addresses. 
- */ + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; - nr_phdr++; - elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); - elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); - - buf = vzalloc(elf_sz); - if (!buf) - return -ENOMEM; - - bufp = buf; - ehdr = (Elf64_Ehdr *)bufp; - bufp += sizeof(Elf64_Ehdr); - memcpy(ehdr->e_ident, ELFMAG, SELFMAG); - ehdr->e_ident[EI_CLASS] = ELFCLASS64; - ehdr->e_ident[EI_DATA] = ELFDATA2LSB; - ehdr->e_ident[EI_VERSION] = EV_CURRENT; - ehdr->e_ident[EI_OSABI] = ELF_OSABI; - memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); - ehdr->e_type = ET_CORE; - ehdr->e_machine = ELF_ARCH; - ehdr->e_version = EV_CURRENT; - ehdr->e_phoff = sizeof(Elf64_Ehdr); - ehdr->e_ehsize = sizeof(Elf64_Ehdr); - ehdr->e_phentsize = sizeof(Elf64_Phdr); - - /* Prepare one phdr of type PT_NOTE for each present cpu */ - for_each_present_cpu(cpu) { - phdr = (Elf64_Phdr *)bufp; - bufp += sizeof(Elf64_Phdr); - phdr->p_type = PT_NOTE; - notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); - phdr->p_offset = phdr->p_paddr = notes_addr; - phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); - (ehdr->e_phnum)++; - } - - /* Prepare one PT_NOTE header for vmcoreinfo */ - phdr = (Elf64_Phdr *)bufp; - bufp += sizeof(Elf64_Phdr); - phdr->p_type = PT_NOTE; - phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; - (ehdr->e_phnum)++; - -#ifdef CONFIG_X86_64 - /* Prepare PT_LOAD type program header for kernel text region */ - phdr = (Elf64_Phdr *)bufp; - bufp += sizeof(Elf64_Phdr); - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; - phdr->p_filesz = phdr->p_memsz = _end - _text; - phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); - (ehdr->e_phnum)++; -#endif - - /* Prepare PT_LOAD headers for system ram chunks. 
*/ - ced->ehdr = ehdr; - ced->bufp = bufp; - ret = walk_system_ram_res(0, -1, ced, - prepare_elf64_ram_headers_callback); - if (ret < 0) - return ret; - - *addr = buf; - *sz = elf_sz; return 0; } @@ -489,18 +252,46 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) { - struct crash_elf_data *ced; - int ret; + struct crash_mem *cmem; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + int ret, i; - ced = kzalloc(sizeof(*ced), GFP_KERNEL); - if (!ced) + cmem = fill_up_crash_elf_data(); + if (!cmem) return -ENOMEM; - fill_up_crash_elf_data(ced, image); + ret = walk_system_ram_res(0, -1, cmem, + prepare_elf64_ram_headers_callback); + if (ret) + goto out; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(cmem); + if (ret) + goto out; /* By default prepare 64bit headers */ - ret = prepare_elf64_headers(ced, addr, sz); - kfree(ced); + ret = crash_prepare_elf64_headers(cmem, + IS_ENABLED(CONFIG_X86_64), addr, sz); + if (ret) + goto out; + + /* + * If a range matches backup region, adjust offset to backup + * segment. 
+ */ + ehdr = (Elf64_Ehdr *)*addr; + phdr = (Elf64_Phdr *)(ehdr + 1); + for (i = 0; i < ehdr->e_phnum; phdr++, i++) + if (phdr->p_type == PT_LOAD && + phdr->p_paddr == image->arch.backup_src_start && + phdr->p_memsz == image->arch.backup_src_sz) { + phdr->p_offset = image->arch.backup_load_addr; + break; + } +out: + vfree(cmem); return ret; } @@ -546,14 +337,14 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, /* Exclude Backup region */ start = image->arch.backup_load_addr; end = start + image->arch.backup_src_sz - 1; - ret = exclude_mem_range(cmem, start, end); + ret = crash_exclude_mem_range(cmem, start, end); if (ret) return ret; /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; - return exclude_mem_range(cmem, start, end); + return crash_exclude_mem_range(cmem, start, end); } /* Prepare memory map for crash dump kernel */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 25de5f6ca997..f39f3a06c26f 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -11,6 +11,7 @@ #include <linux/of_address.h> #include <linux/of_platform.h> #include <linux/of_irq.h> +#include <linux/libfdt.h> #include <linux/slab.h> #include <linux/pci.h> #include <linux/of_pci.h> @@ -130,34 +131,52 @@ static void __init dtb_setup_hpet(void) #endif } +#ifdef CONFIG_X86_LOCAL_APIC + +static void __init dtb_cpu_setup(void) +{ + struct device_node *dn; + u32 apic_id, version; + int ret; + + version = GET_APIC_VERSION(apic_read(APIC_LVR)); + for_each_node_by_type(dn, "cpu") { + ret = of_property_read_u32(dn, "reg", &apic_id); + if (ret < 0) { + pr_warn("%pOF: missing local APIC ID\n", dn); + continue; + } + generic_processor_info(apic_id, version); + } +} + static void __init dtb_lapic_setup(void) { -#ifdef CONFIG_X86_LOCAL_APIC struct device_node *dn; struct resource r; + unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE; int ret; dn = 
of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); - if (!dn) - return; - - ret = of_address_to_resource(dn, 0, &r); - if (WARN_ON(ret)) - return; + if (dn) { + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + lapic_addr = r.start; + } /* Did the boot loader setup the local APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC)) { - if (apic_force_enable(r.start)) + if (apic_force_enable(lapic_addr)) return; } smp_found_config = 1; pic_mode = 1; - register_lapic_address(r.start); - generic_processor_info(boot_cpu_physical_apicid, - GET_APIC_VERSION(apic_read(APIC_LVR))); -#endif + register_lapic_address(lapic_addr); } +#endif /* CONFIG_X86_LOCAL_APIC */ + #ifdef CONFIG_X86_IO_APIC static unsigned int ioapic_id; @@ -194,19 +213,22 @@ static struct of_ioapic_type of_ioapic_type[] = static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - struct of_phandle_args *irq_data = (void *)arg; + struct irq_fwspec *fwspec = (struct irq_fwspec *)arg; struct of_ioapic_type *it; struct irq_alloc_info tmp; + int type_index; - if (WARN_ON(irq_data->args_count < 2)) + if (WARN_ON(fwspec->param_count < 2)) return -EINVAL; - if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) + + type_index = fwspec->param[1]; + if (type_index >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[irq_data->args[1]]; + it = &of_ioapic_type[type_index]; ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = irq_data->args[0]; + tmp.ioapic_pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } @@ -255,11 +277,14 @@ static void __init dtb_ioapic_setup(void) {} static void __init dtb_apic_setup(void) { +#ifdef CONFIG_X86_LOCAL_APIC dtb_lapic_setup(); + dtb_cpu_setup(); +#endif dtb_ioapic_setup(); } -#ifdef CONFIG_OF_FLATTREE +#ifdef CONFIG_OF_EARLY_FLATTREE static void __init 
x86_flattree_get_config(void) { u32 size, map_len; @@ -270,14 +295,15 @@ static void __init x86_flattree_get_config(void) map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - initial_boot_params = dt = early_memremap(initial_dtb, map_len); - size = of_get_flat_dt_size(); + dt = early_memremap(initial_dtb, map_len); + size = fdt_totalsize(dt); if (map_len < size) { early_memunmap(dt, map_len); - initial_boot_params = dt = early_memremap(initial_dtb, size); + dt = early_memremap(initial_dtb, size); map_len = size; } + early_init_dt_verify(dt); unflatten_and_copy_device_tree(); early_memunmap(dt, map_len); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index a2d8a3908670..18fa9d74c182 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -24,7 +24,7 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; -unsigned int code_bytes = 64; +static unsigned int code_bytes = 64; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -375,3 +375,50 @@ static int __init code_bytes_setup(char *s) return 1; } __setup("code_bytes=", code_bytes_setup); + +void show_regs(struct pt_regs *regs) +{ + bool all = true; + int i; + + show_regs_print_info(KERN_DEFAULT); + + if (IS_ENABLED(CONFIG_X86_32)) + all = !user_mode(regs); + + __show_regs(regs, all); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. 
+ */ + if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); + + printk(KERN_DEFAULT "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + pr_cont(" Bad RIP value."); + break; + } + if (ip == (u8 *)regs->ip) + pr_cont("<%02x> ", c); + else + pr_cont("%02x ", c); + } + } + pr_cont("\n"); +} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 04170f63e3a1..cd53f3030e40 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -127,45 +127,3 @@ unknown: info->type = STACK_TYPE_UNKNOWN; return -EINVAL; } - -void show_regs(struct pt_regs *regs) -{ - int i; - - show_regs_print_info(KERN_EMERG); - __show_regs(regs, !user_mode(regs)); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. 
- */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - show_trace_log_lvl(current, regs, NULL, KERN_EMERG); - - pr_emerg("Code:"); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at IP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - pr_cont(" Bad EIP value."); - break; - } - if (ip == (u8 *)regs->ip) - pr_cont(" <%02x>", c); - else - pr_cont(" %02x", c); - } - } - pr_cont("\n"); -} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 563e28d14f2c..5cdb9e84da57 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -149,45 +149,3 @@ unknown: info->type = STACK_TYPE_UNKNOWN; return -EINVAL; } - -void show_regs(struct pt_regs *regs) -{ - int i; - - show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, 1); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. 
- */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); - - printk(KERN_DEFAULT "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at IP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - pr_cont(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - pr_cont("<%02x> ", c); - else - pr_cont("%02x ", c); - } - } - pr_cont("\n"); -} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 71c11ad5643e..6a2cb1442e05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -924,6 +924,24 @@ static int __init parse_memmap_one(char *p) } else if (*p == '!') { start_at = memparse(p+1, &p); e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else if (*p == '%') { + enum e820_type from = 0, to = 0; + + start_at = memparse(p + 1, &p); + if (*p == '-') + from = simple_strtoull(p + 1, &p, 0); + if (*p == '+') + to = simple_strtoull(p + 1, &p, 0); + if (*p != '\0') + return -EINVAL; + if (from && to) + e820__range_update(start_at, mem_size, from, to); + else if (to) + e820__range_add(start_at, mem_size, to); + else if (from) + e820__range_remove(start_at, mem_size, from, 1); + else + e820__range_remove(start_at, mem_size, 0, 0); } else { e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index e5ec3cafa72e..aebd0d5bc086 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,6 +195,10 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); + /* + * __PAGE_KERNEL_* includes _PAGE_GLOBAL, 
which we want since + * this is mapped to userspace. + */ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7ba5d819ebe3..0c408f8c4ed4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -32,6 +32,11 @@ #include <asm/microcode.h> #include <asm/kasan.h> +#ifdef CONFIG_X86_5LEVEL +#undef pgtable_l5_enabled +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + /* * Manage page tables very early on. */ @@ -39,6 +44,24 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled __ro_after_init; +EXPORT_SYMBOL(__pgtable_l5_enabled); +unsigned int pgdir_shift __ro_after_init = 39; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 1; +EXPORT_SYMBOL(ptrs_per_p4d); +#endif + +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT +unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; +EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; +EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; +EXPORT_SYMBOL(vmemmap_base); +#endif + #define __head __section(.head.text) static void __head *fixup_pointer(void *ptr, unsigned long physaddr) @@ -46,6 +69,41 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } +static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +#ifdef CONFIG_X86_5LEVEL +static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +static bool __head 
check_la57_support(unsigned long physaddr) +{ + if (native_cpuid_eax(0) < 7) + return false; + + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return false; + + *fixup_int(&pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&pgdir_shift, physaddr) = 48; + *fixup_int(&ptrs_per_p4d, physaddr) = 512; + *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; + *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; + *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + + return true; +} +#else +static bool __head check_la57_support(unsigned long physaddr) +{ + return false; +} +#endif + unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { @@ -55,9 +113,12 @@ unsigned long __head __startup_64(unsigned long physaddr, p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; + bool la57; int i; unsigned int *next_pgt_ptr; + la57 = check_la57_support(physaddr); + /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) for (;;); @@ -81,9 +142,14 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p = pgd + pgd_index(__START_KERNEL_map); + if (la57) + *p = (unsigned long)level4_kernel_pgt; + else + *p = (unsigned long)level3_kernel_pgt; + *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + + if (la57) { p4d = fixup_pointer(&level4_kernel_pgt, physaddr); p4d[511] += load_delta; } @@ -108,7 +174,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (la57) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; @@ -129,6 +195,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[i + 1] = (pudval_t)pmd + pgtable_flags; 
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + pmd_entry &= __supported_pte_mask; pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; @@ -154,8 +222,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * Fixup phys_base - remove the memory encryption mask to obtain * the true physical address. */ - p = fixup_pointer(&phys_base, physaddr); - *p += load_delta - sme_get_me_mask(); + *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp); @@ -206,7 +273,7 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... */ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) p4d_p = pgd_p; else if (pgd) p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); @@ -322,7 +389,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c29020907886..b59e4fb40fd9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -37,7 +37,7 @@ #define X86 new_cpu_data+CPUINFO_x86 #define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor #define X86_MODEL new_cpu_data+CPUINFO_x86_model -#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping #define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math #define X86_CPUID new_cpu_data+CPUINFO_cpuid_level #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability @@ -332,7 +332,7 @@ 
ENTRY(startup_32_smp) shrb $4,%al movb %al,X86_MODEL andb $0x0f,%cl # mask mask revision - movb %cl,X86_MASK + movb %cl,X86_STEPPING movl %edx,X86_CAPABILITY .Lis486: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 04a625f0fcda..8344dd2f310a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -23,6 +23,7 @@ #include <asm/nops.h> #include "../entry/calling.h" #include <asm/export.h> +#include <asm/nospec-branch.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -38,12 +39,12 @@ * */ +#define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) -PGD_START_KERNEL = pgd_index(__START_KERNEL_map) -#endif +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -124,7 +125,10 @@ ENTRY(secondary_startup_64) /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL + testl $1, __pgtable_l5_enabled(%rip) + jz 1f orl $X86_CR4_LA57, %ecx +1: #endif movq %rcx, %cr4 @@ -134,6 +138,7 @@ ENTRY(secondary_startup_64) /* Ensure I am executing from virtual addresses */ movq $1f, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: UNWIND_HINT_EMPTY @@ -372,12 +377,7 @@ GLOBAL(name) __INITDATA NEXT_PGD_PAGE(early_top_pgt) - .fill 511,8,0 -#ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC -#else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC -#endif + .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) @@ -388,9 +388,9 @@ NEXT_PAGE(early_dynamic_pgts) #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC - .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 + .org init_top_pgt + L4_PAGE_OFFSET*8, 0 
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC - .org init_top_pgt + PGD_START_KERNEL*8, 0 + .org init_top_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 @@ -399,8 +399,13 @@ NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. + /* + * Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. + * + * Note: This sets _PAGE_GLOBAL despite whether + * the CPU supports it or it is enabled. But, + * the CPU should ignore the bit. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #else @@ -431,6 +436,10 @@ NEXT_PAGE(level2_kernel_pgt) * (NOTE: at +512MB starts the module area, see MODULES_VADDR. * If you want to increase this then increase MODULES_VADDR * too.) + * + * This table is eventually used by the kernel during normal + * runtime. Care must be taken to clear out undesired bits + * later, like _PAGE_RW or _PAGE_GLOBAL in some cases. 
*/ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 56d99be3706a..2c3a1b4294eb 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -140,6 +140,9 @@ static const __initconst struct idt_data apic_idts[] = { # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, irq_work_interrupt), # endif +#ifdef CONFIG_X86_UV + INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), +#endif INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), INTG(ERROR_APIC_VECTOR, error_interrupt), #endif @@ -160,7 +163,6 @@ static const __initconst struct idt_data early_pf_idts[] = { */ static const __initconst struct idt_data dbg_idts[] = { INTG(X86_TRAP_DB, debug), - INTG(X86_TRAP_BP, int3), }; #endif @@ -183,7 +185,6 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - SISTG(X86_TRAP_BP, int3, DEBUG_STACK), ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), #ifdef CONFIG_X86_MCE ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 2f723301eb58..0fe1c8782208 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -23,7 +23,7 @@ /* * this changes the io permissions bitmap in the current task. 
*/ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) { struct thread_struct *t = &current->thread; struct tss_struct *tss; @@ -96,6 +96,11 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) return 0; } +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return ksys_ioperm(from, num, turn_on); +} + /* * sys_iopl has to be used when you want to access the IO ports * beyond the 0x3ff range: to get the full 65536 ports bitmapped diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 45fb4d2565f8..328d027d829d 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -150,6 +150,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->irq_hv_reenlightenment_count); seq_puts(p, " Hyper-V reenlightenment interrupts\n"); } + if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HVS"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->hyperv_stimer0_count); + seq_puts(p, " Hyper-V stimer0 interrupts\n"); + } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a539410c4ea9..772196c1b8c4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -61,9 +61,14 @@ void __init init_ISA_irqs(void) struct irq_chip *chip = legacy_pic->chip; int i; -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* + * Try to set up the through-local-APIC virtual wire mode earlier. + * + * On some 32-bit UP machines, whose APIC has been disabled by BIOS + * and then got re-enabled by "lapic", it hangs at boot time without this.
+ */ init_bsp_APIC(); -#endif + legacy_pic->init(0); for (i = 0; i < nr_legacy_irqs(); i++) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index b68fd895235a..a15fe0e92cf9 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL2.0 +// SPDX-License-Identifier: GPL-2.0 /* * Jailhouse paravirt_ops implementation * @@ -124,6 +124,14 @@ static int __init jailhouse_pci_arch_init(void) if (pcibios_last_bus < 0) pcibios_last_bus = 0xff; +#ifdef CONFIG_PCI_MMCONFIG + if (setup_data.pci_mmconfig_base) { + pci_mmconfig_add(0, 0, pcibios_last_bus, + setup_data.pci_mmconfig_base); + pci_mmcfg_arch_init(); + } +#endif + return 0; } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index fb095ba0c02f..7326078eaa7a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -334,7 +334,6 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; - unsigned long purgatory_load_addr; struct bzimage64_data *ldata; struct kexec_entry64_regs regs64; void *stack; @@ -342,6 +341,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, .top_down = true }; + struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR, + .buf_max = ULONG_MAX, .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -379,14 +380,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel, * Load purgatory. For 64bit entry point, purgatory code can be * anywhere. 
*/ - ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, - &purgatory_load_addr); + ret = kexec_load_purgatory(image, &pbuf); if (ret) { pr_err("Loading purgatory failed\n"); return ERR_PTR(ret); } - pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); /* @@ -398,11 +398,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, * little bit simple */ efi_map_sz = efi_get_runtime_map_size(); - efi_map_sz = ALIGN(efi_map_sz, 16); params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - kbuf.bufsz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + sizeof(struct efi_setup_data); @@ -410,7 +409,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; - efi_setup_data_offset = efi_map_offset + efi_map_sz; + efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); /* Copy setup header onto bootparams. 
Documentation/x86/boot.txt */ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; @@ -538,7 +537,7 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) } #endif -struct kexec_file_ops kexec_bzImage64_ops = { +const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index bd36f3c33cd0..0715f827607c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1168,10 +1168,18 @@ NOKPROBE_SYMBOL(longjmp_break_handler); bool arch_within_kprobe_blacklist(unsigned long addr) { + bool is_in_entry_trampoline_section = false; + +#ifdef CONFIG_X86_64 + is_in_entry_trampoline_section = + (addr >= (unsigned long)__entry_trampoline_start && + addr < (unsigned long)__entry_trampoline_end); +#endif return (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) || (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end); + addr < (unsigned long)__entry_text_end) || + is_in_entry_trampoline_section; } int __init arch_init_kprobes(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4e37d1a851a6..7867417cfaff 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -49,7 +49,7 @@ static int kvmapf = 1; -static int parse_no_kvmapf(char *arg) +static int __init parse_no_kvmapf(char *arg) { kvmapf = 0; return 0; @@ -58,7 +58,7 @@ static int parse_no_kvmapf(char *arg) early_param("no-kvmapf", parse_no_kvmapf); static int steal_acc = 1; -static int parse_no_stealacc(char *arg) +static int __init parse_no_stealacc(char *arg) { steal_acc = 0; return 0; @@ -67,7 +67,7 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static int kvmclock_vsyscall = 1; -static int parse_no_kvmclock_vsyscall(char *arg) +static int __init 
parse_no_kvmclock_vsyscall(char *arg) { kvmclock_vsyscall = 0; return 0; @@ -341,10 +341,10 @@ static void kvm_guest_cpu_init(void) #endif pa |= KVM_ASYNC_PF_ENABLED; - /* Async page fault support for L1 hypervisor is optional */ - if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, - (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) + pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -454,6 +454,13 @@ static void __init sev_map_percpu_data(void) } #ifdef CONFIG_SMP +static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) + static_branch_disable(&virt_spin_lock_key); +} + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -545,7 +552,9 @@ static void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_DEDICATED) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -555,6 +564,7 @@ static void __init kvm_guest_init(void) kvm_setup_vsyscall_timeinfo(); #ifdef CONFIG_SMP + smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) @@ -604,6 +614,11 @@ unsigned int kvm_arch_para_features(void) return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); } +unsigned int kvm_arch_para_hints(void) +{ + return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} + static uint32_t __init kvm_detect(void) { return kvm_cpuid_base(); 
@@ -633,7 +648,9 @@ static __init int kvm_setup_pv_tlb_flush(void) { int cpu; - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) { + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_DEDICATED) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); @@ -728,6 +745,9 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; + if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) + return; + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 26d713ecad34..c9b14020f4dd 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -145,6 +145,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) unsigned long offset = i << PAGE_SHIFT; const void *src = (char *)ldt->entries + offset; unsigned long pfn; + pgprot_t pte_prot; pte_t pte, *ptep; va = (unsigned long)ldt_slot_va(slot) + offset; @@ -163,7 +164,10 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) * target via some kernel interface which misses a * permission check. 
*/ - pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(pte_prot) &= __supported_pte_mask; + pte = pfn_pte(pfn, pte_prot); set_pte_at(mm, va, ptep, pte); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index edfede768688..60cdec6628b0 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -195,11 +195,11 @@ void machine_kexec(struct kimage *image) /* * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump - * paths already have calls to disable_IO_APIC() in - * one form or other. kexec jump path also need - * one. + * paths already have calls to restore_boot_irq_mode() + * in one form or other. kexec jump path also need one. */ - disable_IO_APIC(); + clear_IO_APIC(); + restore_boot_irq_mode(); #endif } diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 1f790cf9d38f..a5e55d832d0a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -30,8 +30,9 @@ #include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE -static struct kexec_file_ops *kexec_file_loaders[] = { +const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, + NULL }; #endif @@ -293,11 +294,11 @@ void machine_kexec(struct kimage *image) /* * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump - * paths already have calls to disable_IO_APIC() in - * one form or other. kexec jump path also need - * one. + * paths already have calls to restore_boot_irq_mode() + * in one form or other. kexec jump path also need one. 
*/ - disable_IO_APIC(); + clear_IO_APIC(); + restore_boot_irq_mode(); #endif } @@ -350,6 +351,7 @@ void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_top_pgt); + VMCOREINFO_NUMBER(pgtable_l5_enabled); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); @@ -363,27 +365,6 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - int i, ret = -ENOEXEC; - struct kexec_file_ops *fops; - - for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { - fops = kexec_file_loaders[i]; - if (!fops || !fops->probe) - continue; - - ret = fops->probe(buf, buf_len); - if (!ret) { - image->fops = fops; - return ret; - } - } - - return ret; -} - void *arch_kexec_kernel_image_load(struct kimage *image) { vfree(image->arch.elf_headers); @@ -398,88 +379,53 @@ void *arch_kexec_kernel_image_load(struct kimage *image) image->cmdline_buf_len); } -int arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - if (!image->fops || !image->fops->cleanup) - return 0; - - return image->fops->cleanup(image->image_loader_data); -} - -#ifdef CONFIG_KEXEC_VERIFY_SIG -int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, - unsigned long kernel_len) -{ - if (!image->fops || !image->fops->verify_sig) { - pr_debug("kernel loader does not support signature verification."); - return -EKEYREJECTED; - } - - return image->fops->verify_sig(kernel, kernel_len); -} -#endif - /* * Apply purgatory relocations. * - * ehdr: Pointer to elf headers - * sechdrs: Pointer to section headers. - * relsec: section index of SHT_RELA section. + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtabsec: Corresponding symtab. * * TODO: Some of the code belongs to generic code. Move that in kexec.c. 
*/ -int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, - Elf64_Shdr *sechdrs, unsigned int relsec) +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, const Elf_Shdr *relsec, + const Elf_Shdr *symtabsec) { unsigned int i; Elf64_Rela *rel; Elf64_Sym *sym; void *location; - Elf64_Shdr *section, *symtabsec; unsigned long address, sec_base, value; const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; - /* - * ->sh_offset has been modified to keep the pointer to section - * contents in memory - */ - rel = (void *)sechdrs[relsec].sh_offset; - - /* Section to which relocations apply */ - section = &sechdrs[sechdrs[relsec].sh_info]; - - pr_debug("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - - /* Associated symbol table */ - symtabsec = &sechdrs[sechdrs[relsec].sh_link]; - - /* String table */ - if (symtabsec->sh_link >= ehdr->e_shnum) { - /* Invalid strtab section number */ - pr_err("Invalid string table section index %d\n", - symtabsec->sh_link); - return -ENOEXEC; - } + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; - strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + rel = (void *)pi->ehdr + relsec->sh_offset; - /* section header string table */ - shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + pr_debug("Applying relocate section %s to %u\n", + shstrtab + relsec->sh_name, relsec->sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { /* * rel[i].r_offset contains byte offset from beginning * of section to the storage unit affected. * - * This is location to update (->sh_offset). This is temporary - * buffer where section is currently loaded. 
This will finally - * be loaded to a different address later, pointed to by + * This is location to update. This is temporary buffer + * where section is currently loaded. This will finally be + * loaded to a different address later, pointed to by * ->sh_addr. kexec takes care of moving it * (kexec_load_segment()). */ - location = (void *)(section->sh_offset + rel[i].r_offset); + location = pi->purgatory_buf; + location += section->sh_offset; + location += rel[i].r_offset; /* Final address of the location */ address = section->sh_addr + rel[i].r_offset; @@ -490,8 +436,8 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get * these respectively. */ - sym = (Elf64_Sym *)symtabsec->sh_offset + - ELF64_R_SYM(rel[i].r_info); + sym = (void *)pi->ehdr + symtabsec->sh_offset; + sym += ELF64_R_SYM(rel[i].r_info); if (sym->st_name) name = strtab + sym->st_name; @@ -514,12 +460,12 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, if (sym->st_shndx == SHN_ABS) sec_base = 0; - else if (sym->st_shndx >= ehdr->e_shnum) { + else if (sym->st_shndx >= pi->ehdr->e_shnum) { pr_err("Invalid section %d for symbol %s\n", sym->st_shndx, name); return -ENOEXEC; } else - sec_base = sechdrs[sym->st_shndx].sh_addr; + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; value = sym->st_value; value += sec_base; @@ -542,6 +488,7 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, goto overflow; break; case R_X86_64_PC32: + case R_X86_64_PLT32: value -= (u64)address; *(u32 *)location = value; break; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index da0c160e5589..f58336af095c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -191,6 +191,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, goto overflow; break; case R_X86_64_PC32: + case R_X86_64_PLT32: if (*(u32 *)loc != 0) goto invalid_relocation; val -= (u64)loc; diff --git a/arch/x86/kernel/mpparse.c 
b/arch/x86/kernel/mpparse.c index 27d0a1712663..f1c5eb99d445 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -410,7 +410,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping; processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; processor.reserved[0] = 0; processor.reserved[1] = 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 041096bdef86..99dc79e76bdc 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -200,9 +200,9 @@ static void native_flush_tlb_global(void) __native_flush_tlb_global(); } -static void native_flush_tlb_single(unsigned long addr) +static void native_flush_tlb_one_user(unsigned long addr) { - __native_flush_tlb_single(addr); + __native_flush_tlb_one_user(addr); } struct static_key paravirt_steal_enabled; @@ -401,7 +401,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, - .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_one_user = native_flush_tlb_one_user, .flush_tlb_others = native_flush_tlb_others, .pgd_alloc = __paravirt_pgd_alloc, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 35c461f21815..bbfc8b1e9104 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/crash_dump.h> #include <linux/dma-mapping.h> +#include <linux/dma-direct.h> #include <linux/bitmap.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -445,8 +446,6 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, npages = size >> PAGE_SHIFT; order = get_order(size); - 
flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - /* alloc enough pages (and possibly more) */ ret = (void *)__get_free_pages(flag, order); if (!ret) @@ -493,7 +492,7 @@ static const struct dma_map_ops calgary_dma_ops = { .map_page = calgary_map_page, .unmap_page = calgary_unmap_page, .mapping_error = calgary_mapping_error, - .dma_supported = x86_dma_supported, + .dma_supported = dma_direct_supported, }; static inline void __iomem * busno_to_bbar(unsigned char num) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index df7ab02f959f..77625b60a510 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,7 +6,6 @@ #include <linux/bootmem.h> #include <linux/gfp.h> #include <linux/pci.h> -#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -18,7 +17,7 @@ static int forbid_dac __read_mostly; -const struct dma_map_ops *dma_ops = &nommu_dma_ops; +const struct dma_map_ops *dma_ops = &dma_direct_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -76,70 +75,12 @@ void __init pci_iommu_alloc(void) } } } -void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) -{ - unsigned long dma_mask; - struct page *page; - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - dma_addr_t addr; - - dma_mask = dma_alloc_coherent_mask(dev, flag); - -again: - page = NULL; - /* CMA can be used only in the context which permits sleeping */ - if (gfpflags_allow_blocking(flag)) { - page = dma_alloc_from_contiguous(dev, count, get_order(size), - flag); - if (page) { - addr = phys_to_dma(dev, page_to_phys(page)); - if (addr + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; - } - } - } - /* fallback */ - if (!page) - page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); - if (!page) - return NULL; - - addr = phys_to_dma(dev, page_to_phys(page)); - if (addr + size > dma_mask) { - __free_pages(page, 
get_order(size)); - - if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { - flag = (flag & ~GFP_DMA32) | GFP_DMA; - goto again; - } - - return NULL; - } - memset(page_address(page), 0, size); - *dma_addr = addr; - return page_address(page); -} - -void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - struct page *page = virt_to_page(vaddr); - - if (!dma_release_from_contiguous(dev, page, count)) - free_pages((unsigned long)vaddr, get_order(size)); -} bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) { if (!*dev) *dev = &x86_dma_fallback_dev; - *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp); - if (!is_device_dma_capable(*dev)) return false; return true; @@ -245,16 +186,6 @@ int arch_dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(arch_dma_supported); -int x86_dma_supported(struct device *dev, u64 mask) -{ - /* Copied from i386. Doesn't make much sense, because it will - only work for pci_alloc_coherent. - The caller just has to use GFP_DMA in this case. */ - if (mask < DMA_BIT_MASK(24)) - return 0; - return 1; -} - static int __init pci_iommu_init(void) { struct iommu_table_entry *p; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c deleted file mode 100644 index 618285e475c6..000000000000 --- a/arch/x86/kernel/pci-nommu.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Fallback functions when the main IOMMU code is not compiled in. This - code is roughly equivalent to i386. 
*/ -#include <linux/dma-direct.h> -#include <linux/scatterlist.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/pci.h> -#include <linux/mm.h> - -#include <asm/processor.h> -#include <asm/iommu.h> -#include <asm/dma.h> - -#define NOMMU_MAPPING_ERROR 0 - -static int -check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) -{ - if (hwdev && !dma_capable(hwdev, bus, size)) { - if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) - printk(KERN_ERR - "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", - name, (long long)bus, size, - (long long)*hwdev->dma_mask); - return 0; - } - return 1; -} - -static dma_addr_t nommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; - WARN_ON(size == 0); - if (!check_addr("map_single", dev, bus, size)) - return NOMMU_MAPPING_ERROR; - flush_write_buffers(); - return bus; -} - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. 
- */ -static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s; - int i; - - WARN_ON(nents == 0 || sg[0].length == 0); - - for_each_sg(sg, s, nents, i) { - BUG_ON(!sg_page(s)); - s->dma_address = sg_phys(s); - if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) - return 0; - s->dma_length = s->length; - } - flush_write_buffers(); - return nents; -} - -static void nommu_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - flush_write_buffers(); -} - - -static void nommu_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, - enum dma_data_direction dir) -{ - flush_write_buffers(); -} - -static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return dma_addr == NOMMU_MAPPING_ERROR; -} - -const struct dma_map_ops nommu_dma_ops = { - .alloc = dma_generic_alloc_coherent, - .free = dma_generic_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .sync_single_for_device = nommu_sync_single_for_device, - .sync_sg_for_device = nommu_sync_sg_for_device, - .is_phys = 1, - .mapping_error = nommu_mapping_error, - .dma_supported = x86_dma_supported, -}; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 0ee0f8f34251..661583662430 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -17,52 +17,6 @@ int swiotlb __read_mostly; -void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) -{ - void *vaddr; - - /* - * Don't print a warning when the first allocation attempt fails. - * swiotlb_alloc_coherent() will print a warning when the DMA - * memory allocation ultimately failed. 
- */ - flags |= __GFP_NOWARN; - - vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, - attrs); - if (vaddr) - return vaddr; - - return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); -} - -void x86_swiotlb_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) -{ - if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) - swiotlb_free_coherent(dev, size, vaddr, dma_addr); - else - dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); -} - -static const struct dma_map_ops x86_swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, - .alloc = x86_swiotlb_alloc_coherent, - .free = x86_swiotlb_free_coherent, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .dma_supported = NULL, -}; - /* * pci_swiotlb_detect_override - set swiotlb to 1 if necessary * @@ -112,7 +66,7 @@ void __init pci_swiotlb_init(void) { if (swiotlb) { swiotlb_init(0); - dma_ops = &x86_swiotlb_dma_ops; + dma_ops = &swiotlb_dma_ops; } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9eb448c7859d..4b100fe0f508 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -205,6 +205,20 @@ static __always_inline void save_fsgs(struct task_struct *task) save_base_legacy(task, task->thread.gsindex, GS); } +#if IS_ENABLED(CONFIG_KVM) +/* + * While a process is running, current->thread.fsbase and current->thread.gsbase + * may not match the corresponding CPU registers (see save_base_legacy()). KVM + * wants an efficient way to save and restore FSBASE and GSBASE. + * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. 
+ */ +void save_fsgs_for_kvm(void) +{ + save_fsgs(current); +} +EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#endif + static __always_inline void loadseg(enum which_selector which, unsigned short sel) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2126b9d27c34..725624b6c0c0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -666,7 +666,7 @@ void native_machine_shutdown(void) * Even without the erratum, it still makes sense to quiet IO APIC * before disabling Local APIC. */ - disable_IO_APIC(); + clear_IO_APIC(); #endif #ifdef CONFIG_SMP @@ -680,6 +680,7 @@ void native_machine_shutdown(void) #endif lapic_shutdown(); + restore_boot_irq_mode(); #ifdef CONFIG_HPET_TIMER hpet_disable(); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 69ac9cb9cac6..f7b82ed7b5b5 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -41,11 +41,11 @@ EXPORT_SYMBOL(rtc_lock); */ int mach_set_rtc_mmss(const struct timespec *now) { - unsigned long nowtime = now->tv_sec; + unsigned long long nowtime = now->tv_sec; struct rtc_time tm; int retval = 0; - rtc_time_to_tm(nowtime, &tm); + rtc_time64_to_tm(nowtime, &tm); if (!rtc_valid_tm(&tm)) { retval = mc146818_set_time(&tm); if (retval) @@ -53,7 +53,7 @@ int mach_set_rtc_mmss(const struct timespec *now) __func__, retval); } else { printk(KERN_ERR - "%s: Invalid RTC value: write of %lx to RTC failed\n", + "%s: Invalid RTC value: write of %llx to RTC failed\n", __func__, nowtime); retval = -EINVAL; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1ae67e982af7..5c623dfe39d1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -50,6 +50,7 @@ #include <linux/init_ohci1394_dma.h> #include <linux/kvm_para.h> #include <linux/dma-contiguous.h> +#include <xen/xen.h> #include <linux/errno.h> #include <linux/kernel.h> @@ -189,9 +190,7 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .x86_phys_bits = 
MAX_PHYSMEM_BITS, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); #endif @@ -536,6 +535,11 @@ static void __init reserve_crashkernel(void) high = true; } + if (xen_pv_domain()) { + pr_info("Ignoring crashkernel for a Xen PV domain\n"); + return; + } + /* 0 means: find the address automatically */ if (crash_base <= 0) { /* @@ -851,6 +855,7 @@ void __init setup_arch(char **cmdline_p) __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); + boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif /* @@ -1204,20 +1209,13 @@ void __init setup_arch(char **cmdline_p) kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. + * Sync back kernel address range. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif + sync_initial_page_table(); tboot_probe(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 497aa766fab3..ea554f812ee1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,24 +287,15 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); -#ifdef CONFIG_X86_32 /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in * the smpboot asm. We can't reliably pick up percpu mappings * using vmalloc_fault(), because exception dispatch needs * percpu data. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? 
*/ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. - */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif + sync_initial_page_table(); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4cdc0b27ec82..da270b95fe4d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,6 +25,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> +#include <linux/syscalls.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -601,7 +602,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -asmlinkage unsigned long sys_sigreturn(void) +SYSCALL_DEFINE0(sigreturn) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; @@ -633,7 +634,7 @@ badframe: } #endif /* CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(void) +SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ac057f9b0763..14c057f29979 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -26,8 +26,8 @@ static inline void signal_compat_build_tests(void) * new fields are handled in copy_siginfo_to_user32()! 
*/ BUILD_BUG_ON(NSIGILL != 11); - BUILD_BUG_ON(NSIGFPE != 13); - BUILD_BUG_ON(NSIGSEGV != 4); + BUILD_BUG_ON(NSIGFPE != 15); + BUILD_BUG_ON(NSIGSEGV != 7); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 4); BUILD_BUG_ON(NSIGCHLD != 6); @@ -43,6 +43,13 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); #define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0); + BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4); + BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8); + + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8); /* * Ensure that the size of each si_field never changes. * If it does, it is a sign that the @@ -63,36 +70,94 @@ static inline void signal_compat_build_tests(void) CHECK_CSI_SIZE (_kill, 2*sizeof(int)); CHECK_SI_SIZE (_kill, 2*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + CHECK_CSI_OFFSET(_timer); CHECK_CSI_SIZE (_timer, 3*sizeof(int)); CHECK_SI_SIZE (_timer, 6*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); + CHECK_CSI_OFFSET(_rt); CHECK_CSI_SIZE (_rt, 3*sizeof(int)); CHECK_SI_SIZE (_rt, 4*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_value) 
!= 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); + CHECK_CSI_OFFSET(_sigchld); CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C); + #ifdef CONFIG_X86_X32_ABI CHECK_CSI_OFFSET(_sigchld_x32); CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); /* no _sigchld_x32 in the generic siginfo_t */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20); #endif CHECK_CSI_OFFSET(_sigfault); CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); + + BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); + + BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18); + + BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); 
CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10); + CHECK_CSI_OFFSET(_sigsys); CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14); + /* any new si_fields should be added here */ } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6f27facbaa9b..0f1cbb042f49 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -77,6 +77,8 @@ #include <asm/i8259.h> #include <asm/misc.h> #include <asm/qspinlock.h> +#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -390,15 +392,47 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +/* + * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * + * These are Intel CPUs that enumerate an LLC that is shared by + * multiple NUMA nodes. The LLC on these systems is shared for + * off-package data access but private to the NUMA node (half + * of the package) for on-package access. + * + * CPUID (the source of the information about the LLC) can only + * enumerate the cache as being shared *or* unshared, but not + * this particular configuration. The CPU in this case enumerates + * the cache to be shared across the entire package (spanning both + * NUMA nodes). 
+ */ + +static const struct x86_cpu_id snc_cpu[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, + {} +}; + static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) - return topology_sane(c, o, "llc"); + /* Do not match if we do not have a valid APICID for cpu: */ + if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) + return false; - return false; + /* Do not match if LLC id does not match: */ + if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2)) + return false; + + /* + * Allow the SNC topology without warning. Return of false + * means 'c' does not share the LLC of 'o'. This will be + * reflected to userspace. + */ + if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + return false; + + return topology_sane(c, o, "llc"); } /* @@ -456,7 +490,8 @@ static struct sched_domain_topology_level x86_topology[] = { /* * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours and Intel Cluster-on-Die have this. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. */ static bool x86_has_numa_in_package; @@ -1281,11 +1316,10 @@ void __init native_smp_prepare_boot_cpu(void) cpu_set_state_online(me); } -void __init native_smp_cpus_done(unsigned int max_cpus) +void __init calculate_max_logical_packages(void) { int ncpus; - pr_debug("Boot done\n"); /* * Today neither Intel nor AMD support heterogenous systems so * extrapolate the boot cpu's data to all packages. 
@@ -1293,6 +1327,13 @@ void __init native_smp_cpus_done(unsigned int max_cpus) ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); +} + +void __init native_smp_cpus_done(unsigned int max_cpus) +{ + pr_debug("Boot done\n"); + + calculate_max_logical_packages(); if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); @@ -1430,8 +1471,8 @@ static void remove_siblinginfo(int cpu) cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); - c->phys_proc_id = 0; c->cpu_core_id = 0; + c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); recompute_smt_state(); } @@ -1530,6 +1571,8 @@ static inline void mwait_play_dead(void) void *mwait_ptr; int i; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; if (!this_cpu_has(X86_FEATURE_MWAIT)) return; if (!this_cpu_has(X86_FEATURE_CLFLUSH)) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 676774b9bb8d..a3f15ed545b5 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -97,7 +97,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, if (off & ~PAGE_MASK) goto out; - error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 446c9ef8cfc3..03f3d7695dac 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -181,7 +181,7 @@ int fixup_bug(struct pt_regs *regs, int trapnr) break; case BUG_TRAP_TYPE_WARN: - regs->ip += LEN_UD0; + regs->ip += LEN_UD2; return 1; } @@ -577,7 +577,6 @@ do_general_protection(struct pt_regs *regs, long error_code) } NOKPROBE_SYMBOL(do_general_protection); -/* May run on IST stack. 
*/ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_DYNAMIC_FTRACE @@ -592,6 +591,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; + /* + * Use ist_enter despite the fact that we don't use an IST stack. + * We can be called from a kprobe in non-CONTEXT_KERNEL kernel + * mode or even during context tracking state changes. + * + * This means that we can't schedule. That's okay. + */ ist_enter(regs); RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP @@ -609,15 +615,10 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) goto exit; - /* - * Let others (NMI) know that the debug stack is in use - * as we may switch to the interrupt stack. - */ - debug_stack_usage_inc(); cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); cond_local_irq_disable(regs); - debug_stack_usage_dec(); + exit: ist_exit(regs); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fb4302738410..91e6da48cbb6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -317,7 +317,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); - do_div(deltatsc, tmp); + deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } @@ -1179,6 +1179,45 @@ struct system_counterval_t convert_art_to_tsc(u64 art) } EXPORT_SYMBOL(convert_art_to_tsc); +/** + * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. + * @art_ns: ART (Always Running Timer) in unit of nanoseconds + * + * PTM requires all timestamps to be in units of nanoseconds. When user + * software requests a cross-timestamp, this function converts system timestamp + * to TSC. 
+ * + * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set + * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check + * that this flag is set before conversion to TSC is attempted. + * + * Return: + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. Used + * by timekeeping code to verify comparability of two cycle + * values. + */ + +struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) +{ + u64 tmp, res, rem; + + rem = do_div(art_ns, USEC_PER_SEC); + + res = art_ns * tsc_khz; + tmp = rem * tsc_khz; + + do_div(tmp, USEC_PER_SEC); + res += tmp; + + return (struct system_counterval_t) { .cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_ns_to_tsc); + + static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 1f9188f5357c..feb28fee6cea 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -5,7 +5,6 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> -#include <asm/sections.h> #define orc_warn(fmt, ...)
\ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) @@ -148,7 +147,7 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + if (init_kernel_text(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5edb27f1a2c4..9d0b5af7db91 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -727,7 +727,8 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) return; check_vip: - if (VEFLAGS & X86_EFLAGS_VIP) { + if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) == + (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) { save_v86_state(regs, VM86_STI); return; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9b138a06c1a4..795f3a80e576 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -102,7 +102,6 @@ SECTIONS _stext = .; /* bootstrapping code */ HEAD_TEXT - . = ALIGN(8); TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT @@ -118,9 +117,11 @@ SECTIONS #ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); + VMLINUX_SYMBOL(__entry_trampoline_start) = .; _entry_trampoline = .; *(.entry_trampoline) . = ALIGN(PAGE_SIZE); + VMLINUX_SYMBOL(__entry_trampoline_end) = .; ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); #endif @@ -198,7 +199,7 @@ SECTIONS . = __vvar_beginning_hack + PAGE_SIZE; } :data - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); @@ -366,8 +367,8 @@ SECTIONS . 
= ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; - STABS_DEBUG - DWARF_DEBUG + STABS_DEBUG + DWARF_DEBUG /* Sections to be discarded */ DISCARDS diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1151ccd72ce9..3ab867603e81 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <linux/pci.h> +#include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> @@ -26,10 +27,11 @@ void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -int __init iommu_init_noop(void) { return 0; } -void iommu_shutdown_noop(void) { } -bool __init bool_x86_init_noop(void) { return false; } -void x86_op_int_noop(int cpu) { } +static int __init iommu_init_noop(void) { return 0; } +static void iommu_shutdown_noop(void) { } +static bool __init bool_x86_init_noop(void) { return false; } +static void x86_op_int_noop(int cpu) { } +static u64 u64_x86_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -90,6 +92,12 @@ struct x86_init_ops x86_init __initdata = { .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, + .init_after_bootmem = x86_init_noop, + }, + + .acpi = { + .get_root_pointer = u64_x86_init_noop, + .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, }; @@ -146,7 +154,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev) } #endif -struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = { - .read = native_io_apic_read, - .disable = native_disable_io_apic, +struct x86_apic_ops x86_apic_ops __ro_after_init = { + .io_apic_read = native_io_apic_read, + .restore = native_restore_boot_irq_mode, }; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a0c5a69bc7c4..82055b90a8b3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -135,6 +135,11 @@ int kvm_update_cpuid(struct kvm_vcpu 
*vcpu) return -EINVAL; } + best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + if (kvm_hlt_in_guest(vcpu->kvm) && best && + (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); @@ -370,7 +375,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT); + F(TOPOEXT) | F(PERFCTR_CORE); /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = @@ -607,7 +612,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | - (1 << KVM_FEATURE_PV_TLB_FLUSH); + (1 << KVM_FEATURE_PV_TLB_FLUSH) | + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d91eaeb01034..b3705ae52824 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -30,6 +30,7 @@ #include "x86.h" #include "tss.h" #include "mmu.h" +#include "pmu.h" /* * Operand types @@ -2887,6 +2888,9 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) return ctxt->ops->cpl(ctxt) > iopl; } +#define VMWARE_PORT_VMPORT (0x5658) +#define VMWARE_PORT_VMRPC (0x5659) + static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { @@ -2898,6 +2902,14 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, unsigned mask = (1 << len) - 1; unsigned long base; + /* + * VMware allows access to these ports even if denied + * by TSS I/O permission bitmap. Mimic behavior. 
+ */ + if (enable_vmware_backdoor && + ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC))) + return true; + ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; @@ -4282,6 +4294,13 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) u64 cr4 = ctxt->ops->get_cr(ctxt, 4); u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); + /* + * VMware allows access to these Pseudo-PMCs even when read via RDPMC + * in Ring3 when CR4.PCE=0. + */ + if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx)) + return X86EMUL_CONTINUE; + if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); @@ -4498,6 +4517,10 @@ static const struct gprefix pfx_0f_2b = { ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N, }; +static const struct gprefix pfx_0f_10_0f_11 = { + I(Unaligned, em_mov), I(Unaligned, em_mov), N, N, +}; + static const struct gprefix pfx_0f_28_0f_29 = { I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; @@ -4709,7 +4732,9 @@ static const struct opcode twobyte_table[256] = { DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ - N, N, N, N, N, N, N, N, + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), + N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 0x20 - 0x2F */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index dc97f2544b6f..98618e397342 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -29,6 +29,7 @@ #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> +#include <linux/eventfd.h> #include <asm/apicdef.h> #include <trace/events/kvm.h> @@ -74,13 +75,38 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, return false; } +static void
synic_update_vector(struct kvm_vcpu_hv_synic *synic, + int vector) +{ + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) + return; + + if (synic_has_vector_connected(synic, vector)) + __set_bit(vector, synic->vec_bitmap); + else + __clear_bit(vector, synic->vec_bitmap); + + if (synic_has_vector_auto_eoi(synic, vector)) + __set_bit(vector, synic->auto_eoi_bitmap); + else + __clear_bit(vector, synic->auto_eoi_bitmap); +} + static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, u64 data, bool host) { - int vector; + int vector, old_vector; + bool masked; vector = data & HV_SYNIC_SINT_VECTOR_MASK; - if (vector < 16 && !host) + masked = data & HV_SYNIC_SINT_MASKED; + + /* + * Valid vectors are 16-255, however, nested Hyper-V attempts to write + * default '0x10000' value on boot and this should not #GP. We need to + * allow zero-initing the register from host as well. + */ + if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked) return 1; /* * Guest may configure multiple SINTs to use the same vector, so @@ -88,18 +114,13 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, * bitmap of vectors with auto-eoi behavior. The bitmaps are * updated here, and atomically queried on fast paths. */ + old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK; atomic64_set(&synic->sint[sint], data); - if (synic_has_vector_connected(synic, vector)) - __set_bit(vector, synic->vec_bitmap); - else - __clear_bit(vector, synic->vec_bitmap); + synic_update_vector(synic, old_vector); - if (synic_has_vector_auto_eoi(synic, vector)) - __set_bit(vector, synic->auto_eoi_bitmap); - else - __clear_bit(vector, synic->auto_eoi_bitmap); + synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); @@ -736,6 +757,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: case HV_X64_MSR_RESET: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: r = true; break; } @@ -981,6 +1005,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, kvm_make_request(KVM_REQ_HV_RESET, vcpu); } break; + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + hv->hv_reenlightenment_control = data; + break; + case HV_X64_MSR_TSC_EMULATION_CONTROL: + hv->hv_tsc_emulation_control = data; + break; + case HV_X64_MSR_TSC_EMULATION_STATUS: + hv->hv_tsc_emulation_status = data; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1009,17 +1042,17 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv->vp_index = (u32)data; break; - case HV_X64_MSR_APIC_ASSIST_PAGE: { + case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv->hv_vapic = data; if (kvm_lapic_enable_pv_eoi(vcpu, 0)) return 1; break; } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; @@ -1105,6 +1138,15 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_RESET: data = 0; break; + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + data = hv->hv_reenlightenment_control; + break; + case HV_X64_MSR_TSC_EMULATION_CONTROL: + data = hv->hv_tsc_emulation_control; + break; + case HV_X64_MSR_TSC_EMULATION_STATUS: + data = hv->hv_tsc_emulation_status; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1129,7 +1171,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, 
APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: + case HV_X64_MSR_VP_ASSIST_PAGE: data = hv->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: @@ -1226,10 +1268,47 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return 1; } +static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) +{ + struct eventfd_ctx *eventfd; + + if (unlikely(!fast)) { + int ret; + gpa_t gpa = param; + + if ((gpa & (__alignof__(param) - 1)) || + offset_in_page(gpa) + sizeof(param) > PAGE_SIZE) + return HV_STATUS_INVALID_ALIGNMENT; + + ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param)); + if (ret < 0) + return HV_STATUS_INVALID_ALIGNMENT; + } + + /* + * Per spec, bits 32-47 contain the extra "flag number". However, we + * have no use for it, and in all known usecases it is zero, so just + * report lookup failure if it isn't. + */ + if (param & 0xffff00000000ULL) + return HV_STATUS_INVALID_PORT_ID; + /* remaining bits are reserved-zero */ + if (param & ~KVM_HYPERV_CONN_ID_MASK) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + /* conn_to_evt is protected by vcpu->kvm->srcu */ + eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param); + if (!eventfd) + return HV_STATUS_INVALID_PORT_ID; + + eventfd_signal(eventfd, 1); + return HV_STATUS_SUCCESS; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; + uint16_t code, rep_idx, rep_cnt; bool fast, longmode; /* @@ -1268,7 +1347,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) /* Hypercall continuation is not supported yet */ if (rep_cnt || rep_idx) { - res = HV_STATUS_INVALID_HYPERCALL_CODE; + ret = HV_STATUS_INVALID_HYPERCALL_CODE; goto set_result; } @@ -1276,11 +1355,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) case HVCALL_NOTIFY_LONG_SPIN_WAIT: kvm_vcpu_on_spin(vcpu, true); break; - case HVCALL_POST_MESSAGE: case
HVCALL_SIGNAL_EVENT: + ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); + if (ret != HV_STATUS_INVALID_PORT_ID) + break; + /* maybe userspace knows this conn_id: fall through */ + case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (!vcpu_to_synic(vcpu)->active) { - res = HV_STATUS_INVALID_HYPERCALL_CODE; + ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } vcpu->run->exit_reason = KVM_EXIT_HYPERV; @@ -1292,12 +1375,79 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_complete_userspace; return 0; default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; + ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } set_result: - ret = res | (((u64)rep_done & 0xfff) << 32); kvm_hv_hypercall_set_result(vcpu, ret); return 1; } + +void kvm_hv_init_vm(struct kvm *kvm) +{ + mutex_init(&kvm->arch.hyperv.hv_lock); + idr_init(&kvm->arch.hyperv.conn_to_evt); +} + +void kvm_hv_destroy_vm(struct kvm *kvm) +{ + struct eventfd_ctx *eventfd; + int i; + + idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i) + eventfd_ctx_put(eventfd); + idr_destroy(&kvm->arch.hyperv.conn_to_evt); +} + +static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) +{ + struct kvm_hv *hv = &kvm->arch.hyperv; + struct eventfd_ctx *eventfd; + int ret; + + eventfd = eventfd_ctx_fdget(fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + mutex_lock(&hv->hv_lock); + ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, + GFP_KERNEL); + mutex_unlock(&hv->hv_lock); + + if (ret >= 0) + return 0; + + if (ret == -ENOSPC) + ret = -EEXIST; + eventfd_ctx_put(eventfd); + return ret; +} + +static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) +{ + struct kvm_hv *hv = &kvm->arch.hyperv; + struct eventfd_ctx *eventfd; + + mutex_lock(&hv->hv_lock); + eventfd = idr_remove(&hv->conn_to_evt, conn_id); + mutex_unlock(&hv->hv_lock); + + if (!eventfd) + return -ENOENT; + + synchronize_srcu(&kvm->srcu); + eventfd_ctx_put(eventfd); + return 0; +} + 
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) +{ + if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) || + (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK)) + return -EINVAL; + + if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN) + return kvm_hv_eventfd_deassign(kvm, args->conn_id); + return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e637631a9574..837465d69c6d 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -88,4 +88,8 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock); +void kvm_hv_init_vm(struct kvm *kvm); +void kvm_hv_destroy_vm(struct kvm *kvm); +int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); + #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f171051eecf3..faa264822cee 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -73,8 +73,19 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { + /* + * FIXME: interrupt.injected represents an interrupt that it's + * side-effects have already been applied (e.g. bit from IRR + * already moved to ISR). Therefore, it is incorrect to rely + * on interrupt.injected to know if there is a pending + * interrupt in the user-mode LAPIC. + * This leads to nVMX/nSVM not be able to distinguish + * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on + * pending interrupt or should re-inject an injected + * interrupt. + */ if (!lapic_in_kernel(v)) - return v->arch.interrupt.pending; + return v->arch.interrupt.injected; if (kvm_cpu_has_extint(v)) return 1; @@ -91,8 +102,19 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { + /* + * FIXME: interrupt.injected represents an interrupt that it's + * side-effects have already been applied (e.g. 
bit from IRR + * already moved to ISR). Therefore, it is incorrect to rely + * on interrupt.injected to know if there is a pending + * interrupt in the user-mode LAPIC. + * This leads to nVMX/nSVM not be able to distinguish + * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on + * pending interrupt or should re-inject an injected + * interrupt. + */ if (!lapic_in_kernel(v)) - return v->arch.interrupt.pending; + return v->arch.interrupt.injected; if (kvm_cpu_has_extint(v)) return 1; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f500293dad8d..9619dcc2b325 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -41,7 +41,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail)) - kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -93,6 +93,11 @@ static inline void enter_guest_mode(struct kvm_vcpu *vcpu) static inline void leave_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags &= ~HF_GUEST_MASK; + + if (vcpu->arch.load_eoi_exitmap_pending) { + vcpu->arch.load_eoi_exitmap_pending = false; + kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); + } } static inline bool is_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 924ac8ce9d50..70dcb5548022 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -321,8 +321,16 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + /* + * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) + * which doesn't have EOI register; Some buggy OSes (e.g. Windows with + * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC + * version first and level-triggered interrupts never get EOIed in + * IOAPIC. 
+ */ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) && + !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } @@ -2002,14 +2010,13 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { - struct kvm_lapic *apic; + struct kvm_lapic *apic = vcpu->arch.apic; int i; - apic_debug("%s\n", __func__); + if (!apic) + return; - ASSERT(vcpu); - apic = vcpu->arch.apic; - ASSERT(apic != NULL); + apic_debug("%s\n", __func__); /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2165,7 +2172,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ - kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -2569,7 +2575,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { - kvm_lapic_reset(vcpu, true); kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 56c36014f7b7..edce055e9fd7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -109,7 +109,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; + return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8eca1d04aeb8..8494dbae41b9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2770,8 
+2770,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, else pte_access &= ~ACC_WRITE_MASK; + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + spte |= (u64)pfn << PAGE_SHIFT; - spte |= shadow_me_mask; if (pte_access & ACC_WRITE_MASK) { @@ -5080,7 +5082,7 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); /* The caller should hold mmu-lock before calling this function. */ -static bool +static __always_inline bool slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) @@ -5110,7 +5112,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, return flush; } -static bool +static __always_inline bool slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool lock_flush_tlb) @@ -5121,7 +5123,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, lock_flush_tlb); } -static bool +static __always_inline bool slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { @@ -5129,7 +5131,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } -static bool +static __always_inline bool slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { @@ -5137,7 +5139,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } -static bool +static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5abae72266b7..6288e9d7068e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ 
b/arch/x86/kvm/paging_tmpl.h @@ -452,14 +452,21 @@ error: * done by is_rsvd_bits_set() above. * * We set up the value of exit_qualification to inject: - * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [2:0] - Derive from the access bits. The exit_qualification might be + * out of date if it is serving an EPT misconfiguration. * [5:3] - Calculated by the page walk of the guest EPT page tables * [7:8] - Derived from [7:8] of real exit_qualification * * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification &= 0x180; + if (write_fault) + vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; + if (user_fault) + vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; + if (fetch_fault) + vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; } #endif diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 026db42a86c3..58ead7db71a3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -244,12 +244,49 @@ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); } +bool is_vmware_backdoor_pmc(u32 pmc_idx) +{ + switch (pmc_idx) { + case VMWARE_BACKDOOR_PMC_HOST_TSC: + case VMWARE_BACKDOOR_PMC_REAL_TIME: + case VMWARE_BACKDOOR_PMC_APPARENT_TIME: + return true; + } + return false; +} + +static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) +{ + u64 ctr_val; + + switch (idx) { + case VMWARE_BACKDOOR_PMC_HOST_TSC: + ctr_val = rdtsc(); + break; + case VMWARE_BACKDOOR_PMC_REAL_TIME: + ctr_val = ktime_get_boot_ns(); + break; + case VMWARE_BACKDOOR_PMC_APPARENT_TIME: + ctr_val = ktime_get_boot_ns() + + vcpu->kvm->arch.kvmclock_offset; + break; + default: + return 1; + } + + *data = ctr_val; + return 0; +} + int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { bool fast_mode = idx & (1u << 
31); struct kvm_pmc *pmc; u64 ctr_val; + if (is_vmware_backdoor_pmc(idx)) + return kvm_pmu_rdpmc_vmware(vcpu, idx, data); + pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx); if (!pmc) return 1; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a9a62b9a73e2..ba8898e1a854 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -9,6 +9,10 @@ /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) +#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000 +#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 +#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 + struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; @@ -114,6 +118,8 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +bool is_vmware_backdoor_pmc(u32 pmc_idx); + extern struct kvm_pmu_ops intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; #endif /* __KVM_X86_PMU_H */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index cd944435dfbd..1495a735b38e 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -19,6 +19,21 @@ #include "lapic.h" #include "pmu.h" +enum pmu_type { + PMU_TYPE_COUNTER = 0, + PMU_TYPE_EVNTSEL, +}; + +enum index { + INDEX_ZERO = 0, + INDEX_ONE, + INDEX_TWO, + INDEX_THREE, + INDEX_FOUR, + INDEX_FIVE, + INDEX_ERROR, +}; + /* duplicated from amd_perfmon_event_map, K7 and above should work. 
*/ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, @@ -31,6 +46,88 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, }; +static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) +{ + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { + if (type == PMU_TYPE_COUNTER) + return MSR_F15H_PERF_CTR; + else + return MSR_F15H_PERF_CTL; + } else { + if (type == PMU_TYPE_COUNTER) + return MSR_K7_PERFCTR0; + else + return MSR_K7_EVNTSEL0; + } +} + +static enum index msr_to_index(u32 msr) +{ + switch (msr) { + case MSR_F15H_PERF_CTL0: + case MSR_F15H_PERF_CTR0: + case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + return INDEX_ZERO; + case MSR_F15H_PERF_CTL1: + case MSR_F15H_PERF_CTR1: + case MSR_K7_EVNTSEL1: + case MSR_K7_PERFCTR1: + return INDEX_ONE; + case MSR_F15H_PERF_CTL2: + case MSR_F15H_PERF_CTR2: + case MSR_K7_EVNTSEL2: + case MSR_K7_PERFCTR2: + return INDEX_TWO; + case MSR_F15H_PERF_CTL3: + case MSR_F15H_PERF_CTR3: + case MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR3: + return INDEX_THREE; + case MSR_F15H_PERF_CTL4: + case MSR_F15H_PERF_CTR4: + return INDEX_FOUR; + case MSR_F15H_PERF_CTL5: + case MSR_F15H_PERF_CTR5: + return INDEX_FIVE; + default: + return INDEX_ERROR; + } +} + +static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, + enum pmu_type type) +{ + switch (msr) { + case MSR_F15H_PERF_CTL0: + case MSR_F15H_PERF_CTL1: + case MSR_F15H_PERF_CTL2: + case MSR_F15H_PERF_CTL3: + case MSR_F15H_PERF_CTL4: + case MSR_F15H_PERF_CTL5: + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + if (type != PMU_TYPE_EVNTSEL) + return NULL; + break; + case MSR_F15H_PERF_CTR0: + case MSR_F15H_PERF_CTR1: + case MSR_F15H_PERF_CTR2: + case MSR_F15H_PERF_CTR3: + case MSR_F15H_PERF_CTR4: + case MSR_F15H_PERF_CTR5: + case MSR_K7_PERFCTR0 ... 
MSR_K7_PERFCTR3: + if (type != PMU_TYPE_COUNTER) + return NULL; + break; + default: + return NULL; + } + + return &pmu->gp_counters[msr_to_index(msr)]; +} + static unsigned amd_find_arch_event(struct kvm_pmu *pmu, u8 event_select, u8 unit_mask) @@ -64,7 +161,18 @@ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0); + unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER); + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { + /* + * The idx is contiguous. The MSRs are not. The counter MSRs + * are interleaved with the event select MSRs. + */ + pmc_idx *= 2; + } + + return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ @@ -96,8 +204,8 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int ret = false; - ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) || - get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) || + get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); return ret; } @@ -107,14 +215,14 @@ static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - /* MSR_K7_PERFCTRn */ - pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + /* MSR_PERFCTRn */ + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { *data = pmc_read_counter(pmc); return 0; } - /* MSR_K7_EVNTSELn */ - pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + /* MSR_EVNTSELn */ + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { *data = pmc->eventsel; return 0; @@ -130,14 +238,14 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u32 msr = msr_info->index; u64 data = msr_info->data; - /* MSR_K7_PERFCTRn */ - pmc = get_gp_pmc(pmu, msr, 
MSR_K7_PERFCTR0); + /* MSR_PERFCTRn */ + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { pmc->counter += data - pmc_read_counter(pmc); return 0; } - /* MSR_K7_EVNTSELn */ - pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + /* MSR_EVNTSELn */ + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { if (data == pmc->eventsel) return 0; @@ -154,7 +262,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; + else + pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xffffffff00200000ull; /* not applicable to AMD; but clean them to prevent any fall out */ @@ -169,7 +281,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - for (i = 0; i < AMD64_NUM_COUNTERS ; i++) { + BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC); + + for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -181,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - for (i = 0; i < AMD64_NUM_COUNTERS; i++) { + for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b3e488a74828..1fc05e428aba 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -131,6 +131,28 @@ static const u32 host_save_user_msrs[] = { #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +struct kvm_sev_info { + bool active; /* SEV enabled guest */ + unsigned int asid; /* ASID used for this guest */ + unsigned int handle; /* SEV firmware handle */ + int fd; /* SEV device fd */ + unsigned long 
pages_locked; /* Number of pages locked */ + struct list_head regions_list; /* List of registered regions */ +}; + +struct kvm_svm { + struct kvm kvm; + + /* Struct members for AVIC */ + u32 avic_vm_id; + u32 ldr_mode; + struct page *avic_logical_id_table_page; + struct page *avic_physical_id_table_page; + struct hlist_node hnode; + + struct kvm_sev_info sev_info; +}; + struct kvm_vcpu; struct nested_state { @@ -178,6 +200,8 @@ struct vcpu_svm { uint64_t sysenter_eip; uint64_t tsc_aux; + u64 msr_decfg; + u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; @@ -274,6 +298,54 @@ static bool npt_enabled = true; static bool npt_enabled; #endif +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * pause_filter_count: On processors that support Pause filtering(indicated + * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter + * count value. On VMRUN this value is loaded into an internal counter. + * Each time a pause instruction is executed, this counter is decremented + * until it reaches zero at which time a #VMEXIT is generated if pause + * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause + * Intercept Filtering for more details. + * This also indicate if ple logic enabled. + * + * pause_filter_thresh: In addition, some processor families support advanced + * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on + * the amount of time a guest is allowed to execute in a pause loop. + * In this mode, a 16-bit pause filter threshold field is added in the + * VMCB. The threshold value is a cycle count that is used to reset the + * pause counter. As with simple pause filtering, VMRUN loads the pause + * count value from VMCB into an internal counter. Then, on each pause + * instruction the hardware checks the elapsed number of cycles since + * the most recent pause instruction against the pause filter threshold. 
+ * If the elapsed cycle count is greater than the pause filter threshold, + * then the internal pause count is reloaded from the VMCB and execution + * continues. If the elapsed cycle count is less than the pause filter + * threshold, then the internal pause count is decremented. If the count + * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is + * triggered. If advanced pause filtering is supported and pause filter + * threshold field is set to zero, the filter will operate in the simpler, + * count only mode. + */ + +static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP; +module_param(pause_filter_thresh, ushort, 0444); + +static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW; +module_param(pause_filter_count, ushort, 0444); + +/* Default doubles per-vcpu window every exit. */ +static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW; +module_param(pause_filter_count_grow, ushort, 0444); + +/* Default resets per-vcpu window every exit to pause_filter_count. */ +static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; +module_param(pause_filter_count_shrink, ushort, 0444); + +/* Default is to compute the maximum so we can never overflow. 
*/ +static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; +module_param(pause_filter_count_max, ushort, 0444); + /* allow nested paging (virtualized MMU) for all guests */ static int npt = true; module_param(npt, int, S_IRUGO); @@ -300,6 +372,8 @@ module_param(vgif, int, 0444); static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); module_param(sev, int, 0444); +static u8 rsm_ins_bytes[] = "\x0f\xaa"; + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -348,6 +422,12 @@ struct enc_region { unsigned long size; }; + +static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_svm, kvm); +} + static inline bool svm_sev_enabled(void) { return max_sev_asid; @@ -355,14 +435,14 @@ static inline bool svm_sev_enabled(void) static inline bool sev_guest(struct kvm *kvm) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; return sev->active; } static inline int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; return sev->asid; } @@ -1079,7 +1159,7 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm) } /* Note: - * This hash table is used to map VM_ID to a struct kvm_arch, + * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in * a particular vCPU. 
*/ @@ -1096,7 +1176,7 @@ static DEFINE_SPINLOCK(svm_vm_data_hash_lock); static int avic_ga_log_notifier(u32 ga_tag) { unsigned long flags; - struct kvm_arch *ka = NULL; + struct kvm_svm *kvm_svm; struct kvm_vcpu *vcpu = NULL; u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); @@ -1104,13 +1184,10 @@ static int avic_ga_log_notifier(u32 ga_tag) pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { - struct kvm *kvm = container_of(ka, struct kvm, arch); - struct kvm_arch *vm_data = &kvm->arch; - - if (vm_data->avic_vm_id != vm_id) + hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { + if (kvm_svm->avic_vm_id != vm_id) continue; - vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); break; } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); @@ -1168,6 +1245,42 @@ err: return rc; } +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + int old = control->pause_filter_count; + + control->pause_filter_count = __grow_ple_window(old, + pause_filter_count, + pause_filter_count_grow, + pause_filter_count_max); + + if (control->pause_filter_count != old) + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + + trace_kvm_ple_window_grow(vcpu->vcpu_id, + control->pause_filter_count, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + int old = control->pause_filter_count; + + control->pause_filter_count = + __shrink_ple_window(old, + pause_filter_count, + pause_filter_count_shrink, + pause_filter_count); + if (control->pause_filter_count != old) + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, + 
control->pause_filter_count, old); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1198,6 +1311,14 @@ static __init int svm_hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 32; } + /* Check for pause filtering support */ + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + pause_filter_count = 0; + pause_filter_thresh = 0; + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { + pause_filter_thresh = 0; + } + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -1302,12 +1423,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu)) + return svm->nested.hsave->control.tsc_offset; + + return vcpu->arch.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); u64 g_tsc_offset = 0; if (is_guest_mode(vcpu)) { + /* Write L1's TSC offset. 
*/ g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; @@ -1324,10 +1456,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static void avic_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb; - struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; + struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); - phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); - phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); + phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); + phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; @@ -1359,6 +1491,14 @@ static void init_vmcb(struct vcpu_svm *svm) set_exception_intercept(svm, MC_VECTOR); set_exception_intercept(svm, AC_VECTOR); set_exception_intercept(svm, DB_VECTOR); + /* + * Guest access to VMware backdoor ports could legitimately + * trigger #GP because of TSS I/O permission bitmap. + * We intercept those #GP and allow access to them anyway + * as VMware does. 
+ */ + if (enable_vmware_backdoor) + set_exception_intercept(svm, GP_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1367,7 +1507,6 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_RDPMC); set_intercept(svm, INTERCEPT_CPUID); set_intercept(svm, INTERCEPT_INVD); - set_intercept(svm, INTERCEPT_HLT); set_intercept(svm, INTERCEPT_INVLPG); set_intercept(svm, INTERCEPT_INVLPGA); set_intercept(svm, INTERCEPT_IOIO_PROT); @@ -1383,12 +1522,16 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); set_intercept(svm, INTERCEPT_XSETBV); + set_intercept(svm, INTERCEPT_RSM); - if (!kvm_mwait_in_guest()) { + if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { set_intercept(svm, INTERCEPT_MONITOR); set_intercept(svm, INTERCEPT_MWAIT); } + if (!kvm_hlt_in_guest(svm->vcpu.kvm)) + set_intercept(svm, INTERCEPT_HLT); + control->iopm_base_pa = __sme_set(iopm_base); control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; @@ -1444,9 +1587,13 @@ static void init_vmcb(struct vcpu_svm *svm) svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; - if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - control->pause_filter_count = 3000; + if (pause_filter_count) { + control->pause_filter_count = pause_filter_count; + if (pause_filter_thresh) + control->pause_filter_thresh = pause_filter_thresh; set_intercept(svm, INTERCEPT_PAUSE); + } else { + clr_intercept(svm, INTERCEPT_PAUSE); } if (kvm_vcpu_apicv_active(&svm->vcpu)) @@ -1483,12 +1630,12 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, unsigned int index) { u64 *avic_physical_id_table; - struct kvm_arch *vm_data = &vcpu->kvm->arch; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) return NULL; - avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page); + avic_physical_id_table = 
page_address(kvm_svm->avic_physical_id_table_page); return &avic_physical_id_table[index]; } @@ -1571,7 +1718,7 @@ static void __sev_asid_free(int asid) static void sev_asid_free(struct kvm *kvm) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; __sev_asid_free(sev->asid); } @@ -1611,7 +1758,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, int write) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; unsigned long npages, npinned, size; unsigned long locked, lock_limit; struct page **pages; @@ -1662,7 +1809,7 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; release_pages(pages, npages); kvfree(pages); @@ -1700,9 +1847,20 @@ static void __unregister_enc_region_locked(struct kvm *kvm, kfree(region); } +static struct kvm *svm_vm_alloc(void) +{ + struct kvm_svm *kvm_svm = kzalloc(sizeof(struct kvm_svm), GFP_KERNEL); + return &kvm_svm->kvm; +} + +static void svm_vm_free(struct kvm *kvm) +{ + kfree(to_kvm_svm(kvm)); +} + static void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -1731,18 +1889,18 @@ static void sev_vm_destroy(struct kvm *kvm) static void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; - struct kvm_arch *vm_data = &kvm->arch; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); if (!avic) return; - if (vm_data->avic_logical_id_table_page) - __free_page(vm_data->avic_logical_id_table_page); - if (vm_data->avic_physical_id_table_page) - __free_page(vm_data->avic_physical_id_table_page); + if (kvm_svm->avic_logical_id_table_page) + 
__free_page(kvm_svm->avic_logical_id_table_page); + if (kvm_svm->avic_physical_id_table_page) + __free_page(kvm_svm->avic_physical_id_table_page); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_del(&vm_data->hnode); + hash_del(&kvm_svm->hnode); spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } @@ -1756,10 +1914,10 @@ static int avic_vm_init(struct kvm *kvm) { unsigned long flags; int err = -ENOMEM; - struct kvm_arch *vm_data = &kvm->arch; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + struct kvm_svm *k2; struct page *p_page; struct page *l_page; - struct kvm_arch *ka; u32 vm_id; if (!avic) @@ -1770,7 +1928,7 @@ static int avic_vm_init(struct kvm *kvm) if (!p_page) goto free_avic; - vm_data->avic_physical_id_table_page = p_page; + kvm_svm->avic_physical_id_table_page = p_page; clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ @@ -1778,7 +1936,7 @@ static int avic_vm_init(struct kvm *kvm) if (!l_page) goto free_avic; - vm_data->avic_logical_id_table_page = l_page; + kvm_svm->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); @@ -1790,15 +1948,13 @@ static int avic_vm_init(struct kvm *kvm) } /* Is it still in use? 
Only possible if wrapped at least once */ if (next_vm_id_wrapped) { - hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { - struct kvm *k2 = container_of(ka, struct kvm, arch); - struct kvm_arch *vd2 = &k2->arch; - if (vd2->avic_vm_id == vm_id) + hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { + if (k2->avic_vm_id == vm_id) goto again; } } - vm_data->avic_vm_id = vm_id; - hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); + kvm_svm->avic_vm_id = vm_id; + hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); return 0; @@ -1902,6 +2058,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u32 dummy; u32 eax = 1; + vcpu->arch.microcode_version = 0x01000065; svm->spec_ctrl = 0; if (!init_event) { @@ -2529,14 +2686,7 @@ static int bp_interception(struct vcpu_svm *svm) static int ud_interception(struct vcpu_svm *svm) { - int er; - - er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; + return handle_ud(&svm->vcpu); } static int ac_interception(struct vcpu_svm *svm) @@ -2545,6 +2695,23 @@ static int ac_interception(struct vcpu_svm *svm) return 1; } +static int gp_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 error_code = svm->vmcb->control.exit_info_1; + int er; + + WARN_ON_ONCE(!enable_vmware_backdoor); + + er = emulate_instruction(vcpu, + EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); + if (er == EMULATE_USER_EXIT) + return 0; + else if (er != EMULATE_DONE) + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; +} + static bool is_erratum_383(void) { int err, i; @@ -2633,7 +2800,7 @@ static int io_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? 
*/ - int size, in, string, ret; + int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; @@ -2645,16 +2812,8 @@ static int io_interception(struct vcpu_svm *svm) port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; svm->next_rip = svm->vmcb->control.exit_info_2; - ret = kvm_skip_emulated_instruction(&svm->vcpu); - /* - * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. - */ - if (in) - return kvm_fast_pio_in(vcpu, size, port) && ret; - else - return kvm_fast_pio_out(vcpu, size, port) && ret; + return kvm_fast_pio(&svm->vcpu, size, port, in); } static int nmi_interception(struct vcpu_svm *svm) @@ -3174,6 +3333,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) /* Restore the original control entries */ copy_vmcb_control_area(vmcb, hsave); + svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -3334,10 +3494,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, /* We don't want to see VMMCALLs from a nested guest */ clr_intercept(svm, INTERCEPT_VMMCALL); + svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; - svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; @@ -3699,6 +3861,12 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +static int rsm_interception(struct vcpu_svm *svm) +{ + return x86_emulate_instruction(&svm->vcpu, 0, 0, + rsm_ins_bytes, 2) == EMULATE_DONE; +} + static int 
rdpmc_interception(struct vcpu_svm *svm) { int err; @@ -3860,17 +4028,27 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } +static int svm_get_msr_feature(struct kvm_msr_entry *msr) +{ + msr->data = 0; + + switch (msr->index) { + case MSR_F10H_DECFG: + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) + msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + break; + default: + return 1; + } + + return 0; +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); switch (msr_info->index) { - case MSR_IA32_TSC: { - msr_info->data = svm->vmcb->control.tsc_offset + - kvm_scale_tsc(vcpu, rdtsc()); - - break; - } case MSR_STAR: msr_info->data = svm->vmcb->save.star; break; @@ -3935,9 +4113,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->spec_ctrl; break; - case MSR_IA32_UCODE_REV: - msr_info->data = 0x01000065; - break; case MSR_F15H_IC_CFG: { int family, model; @@ -3955,6 +4130,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; + case MSR_F10H_DECFG: + msr_info->data = svm->msr_decfg; + break; default: return kvm_get_msr_common(vcpu, msr_info); } @@ -4023,9 +4201,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.g_pat = data; mark_dirty(svm->vmcb, VMCB_NPT); break; - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr); - break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) @@ -4133,6 +4308,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_F10H_DECFG: { + struct kvm_msr_entry msr_entry; + + msr_entry.index = msr->index; + if (svm_get_msr_feature(&msr_entry)) + return 1; + + /* Check the supported bits */ + if (data & ~msr_entry.data) + return 1; + + /* Don't allow the guest to change a 
bit, #GP */ + if (!msr->host_initiated && (data ^ msr_entry.data)) + return 1; + + svm->msr_decfg = data; + break; + } case MSR_IA32_APICBASE: if (kvm_vcpu_apicv_active(vcpu)) avic_update_vapic_bar(to_svm(vcpu), data); @@ -4187,6 +4380,9 @@ static int pause_interception(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; bool in_kernel = (svm_get_cpl(vcpu) == 0); + if (pause_filter_thresh) + grow_ple_window(vcpu); + kvm_vcpu_on_spin(vcpu, in_kernel); return 1; } @@ -4277,7 +4473,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { - struct kvm_arch *vm_data = &vcpu->kvm->arch; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); int index; u32 *logical_apic_id_table; int dlid = GET_APIC_LOGICAL_ID(ldr); @@ -4299,7 +4495,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) index = (cluster << 2) + apic; } - logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page); + logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); return &logical_apic_id_table[index]; } @@ -4379,7 +4575,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_arch *vm_data = &vcpu->kvm->arch; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); u32 mod = (dfr >> 28) & 0xf; @@ -4388,11 +4584,11 @@ static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) * If this changes, we need to flush the AVIC logical * APID id table. 
*/ - if (vm_data->ldr_mode == mod) + if (kvm_svm->ldr_mode == mod) return 0; - clear_page(page_address(vm_data->avic_logical_id_table_page)); - vm_data->ldr_mode = mod; + clear_page(page_address(kvm_svm->avic_logical_id_table_page)); + kvm_svm->ldr_mode = mod; if (svm->ldr_reg) avic_handle_ldr_update(vcpu); @@ -4512,6 +4708,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, + [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, @@ -4541,7 +4738,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = npf_interception, - [SVM_EXIT_RSM] = emulate_on_interception, + [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, }; @@ -4560,6 +4757,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); pr_err("%-20s%016llx\n", "intercepts:", control->intercept); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); + pr_err("%-20s%d\n", "pause filter threshold:", + control->pause_filter_thresh); pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); @@ -5027,7 +5226,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); - pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, + pi.ga_tag = 
AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; pi.vcpu_data = &vcpu_info; @@ -5071,9 +5270,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, } if (!ret && svm) { - trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, - host_irq, e->gsi, - vcpu_info.vector, + trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, + e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); } @@ -5191,6 +5389,11 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } +static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + return 0; +} + static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5355,7 +5558,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * being speculatively taken. */ if (svm->spec_ctrl) - wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); asm volatile ( "push %%" _ASM_BP "; \n\t" @@ -5464,11 +5667,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. 
*/ - if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) - rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); if (svm->spec_ctrl) - wrmsrl(MSR_IA32_SPEC_CTRL, 0); + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); @@ -5492,14 +5695,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_handle_nmi(&svm->vcpu); + kvm_before_interrupt(&svm->vcpu); stgi(); /* Any pending NMI will happen here */ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_after_handle_nmi(&svm->vcpu); + kvm_after_interrupt(&svm->vcpu); sync_cr8_to_lapic(vcpu); @@ -5875,6 +6078,8 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu) static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) { + if (pause_filter_thresh) + shrink_ple_window(vcpu); } static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) @@ -5991,7 +6196,7 @@ static int sev_asid_new(void) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; int asid, ret; ret = -EBUSY; @@ -6056,14 +6261,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error) static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_start *start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ 
-6161,7 +6366,7 @@ static int get_num_contig_pages(int idx, struct page **inpages, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, size; - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data *data; struct page **inpages; @@ -6236,16 +6441,18 @@ e_free: static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + void __user *measure = (void __user *)(uintptr_t)argp->data; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_measure *data; struct kvm_sev_launch_measure params; + void __user *p = NULL; void *blob = NULL; int ret; if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(&params, measure, sizeof(params))) return -EFAULT; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -6256,17 +6463,13 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - if (params.uaddr) { + p = (void __user *)(uintptr_t)params.uaddr; + if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) { ret = -EINVAL; goto e_free; } - if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) { - ret = -EFAULT; - goto e_free; - } - ret = -ENOMEM; blob = kmalloc(params.len, GFP_KERNEL); if (!blob) @@ -6290,13 +6493,13 @@ cmd: goto e_free_blob; if (blob) { - if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len)) + if (copy_to_user(p, blob, params.len)) ret = -EFAULT; } done: params.len = data->len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) + if (copy_to_user(measure, &params, sizeof(params))) ret = -EFAULT; e_free_blob: kfree(blob); @@ -6307,7 +6510,7 @@ e_free: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd
*argp) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish *data; int ret; @@ -6327,7 +6530,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status *data; int ret; @@ -6359,7 +6562,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg *data; int ret; @@ -6591,13 +6794,13 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret *data; struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; unsigned long n; - int ret; + int ret, offset; if (!sev_guest(kvm)) return -ENOTTY; @@ -6623,6 +6826,10 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!data) goto e_unpin_memory; + offset = params.guest_uaddr & (PAGE_SIZE - 1); + data->guest_address = __sme_page_pa(pages[0]) + offset; + data->guest_len = params.guest_len; + blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); if (IS_ERR(blob)) { ret = PTR_ERR(blob); @@ -6637,8 +6844,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) ret = PTR_ERR(hdr); goto e_free_blob; } - data->trans_address = __psp_pa(blob); - data->trans_len = params.trans_len; + data->hdr_address = __psp_pa(hdr); + data->hdr_len = params.hdr_len; data->handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error); @@ -6711,7 +6918,7 @@ 
out: static int svm_register_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct enc_region *region; int ret = 0; @@ -6753,7 +6960,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -6811,6 +7018,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vcpu_free = svm_free_vcpu, .vcpu_reset = svm_vcpu_reset, + .vm_alloc = svm_vm_alloc, + .vm_free = svm_vm_free, .vm_init = avic_vm_init, .vm_destroy = svm_vm_destroy, @@ -6821,6 +7030,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vcpu_unblocking = svm_vcpu_unblocking, .update_bp_intercept = update_bp_intercept, + .get_msr_feature = svm_get_msr_feature, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, @@ -6876,6 +7086,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, + .set_identity_map_addr = svm_set_identity_map_addr, .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, @@ -6895,6 +7106,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f427723dc7db..c7668806163f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -52,9 +52,11 @@ #include <asm/irq_remapping.h> #include <asm/mmu_context.h> #include <asm/nospec-branch.h> +#include <asm/mshyperv.h> #include "trace.h" #include "pmu.h" +#include "vmx_evmcs.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define 
__ex_clear(x, reg) \ @@ -130,13 +132,15 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ + X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) +#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -165,34 +169,33 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 -#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 -#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 -#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ - INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW +static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; -static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; -module_param(ple_gap, int, S_IRUGO); - -static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, int, S_IRUGO); +static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, uint, 0444); /* Default doubles per-vcpu window every exit. 
*/ -static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, int, S_IRUGO); +static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, uint, 0444); /* Default resets per-vcpu window every exit to ple_window. */ -static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, int, S_IRUGO); +static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, uint, 0444); /* Default is to compute the maximum so we can never overflow. */ -static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, int, S_IRUGO); +static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +struct kvm_vmx { + struct kvm kvm; + + unsigned int tss_addr; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; +}; + #define NR_AUTOLOAD_MSRS 8 struct vmcs { @@ -424,6 +427,35 @@ struct __packed vmcs12 { */ #define VMCS12_MAX_FIELD_INDEX 0x17 +struct nested_vmx_msrs { + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ + u32 procbased_ctls_low; + u32 procbased_ctls_high; + u32 secondary_ctls_low; + u32 secondary_ctls_high; + u32 pinbased_ctls_low; + u32 pinbased_ctls_high; + u32 exit_ctls_low; + u32 exit_ctls_high; + u32 entry_ctls_low; + u32 entry_ctls_high; + u32 misc_low; + u32 misc_high; + u32 ept_caps; + u32 vpid_caps; + u64 basic; + u64 cr0_fixed0; + u64 cr0_fixed1; + u64 cr4_fixed0; + u64 cr4_fixed1; + u64 vmcs_enum; + u64 vmfunc_controls; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. 
@@ -475,32 +507,7 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; - /* - * We only store the "true" versions of the VMX capability MSRs. We - * generate the "non-true" versions by setting the must-be-1 bits - * according to the SDM. - */ - u32 nested_vmx_procbased_ctls_low; - u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_secondary_ctls_low; - u32 nested_vmx_secondary_ctls_high; - u32 nested_vmx_pinbased_ctls_low; - u32 nested_vmx_pinbased_ctls_high; - u32 nested_vmx_exit_ctls_low; - u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_entry_ctls_low; - u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_misc_low; - u32 nested_vmx_misc_high; - u32 nested_vmx_ept_caps; - u32 nested_vmx_vpid_caps; - u64 nested_vmx_basic; - u64 nested_vmx_cr0_fixed0; - u64 nested_vmx_cr0_fixed1; - u64 nested_vmx_cr4_fixed0; - u64 nested_vmx_cr4_fixed1; - u64 nested_vmx_vmcs_enum; - u64 nested_vmx_vmfunc_controls; + struct nested_vmx_msrs msrs; /* SMM related state */ struct { @@ -691,6 +698,11 @@ enum segment_cache_field { SEG_FIELD_NR = 4 }; +static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_vmx, kvm); +} + static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); @@ -953,6 +965,7 @@ static struct vmcs_config { u32 cpu_based_2nd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; + struct nested_vmx_msrs nested; } vmcs_config; static struct vmx_capability { @@ -999,6 +1012,169 @@ static const u32 vmx_msr_index[] = { MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; +DEFINE_STATIC_KEY_FALSE(enable_evmcs); + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#define KVM_EVMCS_VERSION 1 + +#if IS_ENABLED(CONFIG_HYPERV) +static bool __read_mostly enlightened_vmcs = true; +module_param(enlightened_vmcs, bool, 0444); + +static inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset 
< 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + /* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + */ + vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + vmcs_conf->cpu_based_2nd_exec_ctrl &= + 
~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; + vmcs_conf->cpu_based_2nd_exec_ctrl &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_conf->cpu_based_2nd_exec_ctrl &= + ~SECONDARY_EXEC_APIC_REGISTER_VIRT; + + /* + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML; + + /* VM_FUNCTION_CONTROL = 0x00002018, */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC; + + /* + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS; + + /* + * TSC_MULTIPLIER = 0x00002032, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING; + + /* + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + */ + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + + /* + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + */ + vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + /* + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + */ + vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + + /* + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +} +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static inline void evmcs_write64(unsigned long field, u64 value) {} +static inline void evmcs_write32(unsigned long field, u32 value) {} +static inline void evmcs_write16(unsigned long field, u16 value) {} +static inline u64 evmcs_read64(unsigned long field) { return 0; } +static inline u32 evmcs_read32(unsigned long field) { return 0; } +static inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + static inline bool 
is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1031,6 +1207,11 @@ static inline bool is_invalid_opcode(u32 intr_info) return is_exception_n(intr_info, UD_VECTOR); } +static inline bool is_gp_fault(u32 intr_info) +{ + return is_exception_n(intr_info, GP_VECTOR); +} + static inline bool is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1044,6 +1225,13 @@ static inline bool is_machine_check(u32 intr_info) (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } +/* Undocumented: icebp/int1 */ +static inline bool is_icebp(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); +} + static inline bool cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; @@ -1313,7 +1501,7 @@ static inline bool report_flexpriority(void) static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) { - return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); } static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) @@ -1334,6 +1522,16 @@ static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) PIN_BASED_VMX_PREEMPTION_TIMER; } +static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; +} + +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -1472,6 +1670,9 @@ static void vmcs_load(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; + if (static_branch_unlikely(&enable_evmcs)) + return 
evmcs_load(phys_addr); + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); @@ -1645,18 +1846,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) static __always_inline u16 vmcs_read16(unsigned long field) { vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read16(field); return __vmcs_readl(field); } static __always_inline u32 vmcs_read32(unsigned long field) { vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read32(field); return __vmcs_readl(field); } static __always_inline u64 vmcs_read64(unsigned long field) { vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); #ifdef CONFIG_X86_64 return __vmcs_readl(field); #else @@ -1667,6 +1874,8 @@ static __always_inline u64 vmcs_read64(unsigned long field) static __always_inline unsigned long vmcs_readl(unsigned long field) { vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); return __vmcs_readl(field); } @@ -1690,18 +1899,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val static __always_inline void vmcs_write16(unsigned long field, u16 value) { vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write16(field, value); + __vmcs_writel(field, value); } static __always_inline void vmcs_write32(unsigned long field, u32 value) { vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, value); + __vmcs_writel(field, value); } static __always_inline void vmcs_write64(unsigned long field, u64 value) { vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + __vmcs_writel(field, value); #ifndef CONFIG_X86_64 asm volatile (""); @@ -1712,6 +1930,9 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value) static 
__always_inline void vmcs_writel(unsigned long field, unsigned long value) { vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + __vmcs_writel(field, value); } @@ -1719,6 +1940,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_clear_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) & ~mask); + __vmcs_writel(field, __vmcs_readl(field) & ~mask); } @@ -1726,6 +1950,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_set_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) | mask); + __vmcs_writel(field, __vmcs_readl(field) | mask); } @@ -1857,6 +2084,14 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); + /* + * Guest access to VMware backdoor ports could legitimately + * trigger #GP because of TSS I/O permission bitmap. + * We intercept those #GP and allow access to them anyway + * as VMware does. 
+ */ + if (enable_vmware_backdoor) + eb |= (1u << GP_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -2122,6 +2357,9 @@ static unsigned long segment_base(u16 selector) static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + int cpu = raw_smp_processor_id(); +#endif int i; if (vmx->host_state.loaded) @@ -2134,7 +2372,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + +#ifdef CONFIG_X86_64 + save_fsgs_for_kvm(); + vmx->host_state.fs_sel = current->thread.fsindex; + vmx->host_state.gs_sel = current->thread.gsindex; +#else savesegment(fs, vmx->host_state.fs_sel); + savesegment(gs, vmx->host_state.gs_sel); +#endif if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -2142,7 +2388,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -2153,20 +2398,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); savesegment(es, vmx->host_state.es_sel); -#endif -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif + vmcs_writel(HOST_FS_BASE, current->thread.fsbase); + vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + 
vmx->msr_host_kernel_gs_base = current->thread.gsbase; if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif if (boot_cpu_has(X86_FEATURE_MPX)) rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); @@ -2525,6 +2766,19 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit return 0; } +static void vmx_clear_hlt(struct kvm_vcpu *vcpu) +{ + /* + * Ensure that we clear the HLT state in the VMCS. We don't need to + * explicitly skip the instruction because if the HLT state is set, + * then the instruction is already executing and RIP has already been + * advanced. + */ + if (kvm_hlt_in_guest(vcpu->kvm) && + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); +} + static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2547,6 +2801,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) return; } + WARN_ON_ONCE(vmx->emulation_required); + if (kvm_exception_is_soft(nr)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); @@ -2555,6 +2811,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + + vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -2622,18 +2880,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -/* - * reads and returns guest's timestamp counter "register" - * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset - * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 - */ -static u64 guest_read_tsc(struct kvm_vcpu *vcpu) +static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) { - u64 host_tsc, tsc_offset; + struct vmcs12 *vmcs12 = 
get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + return vcpu->arch.tsc_offset - vmcs12->tsc_offset; - host_tsc = rdtsc(); - tsc_offset = vmcs_read64(TSC_OFFSET); - return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + return vcpu->arch.tsc_offset; } /* @@ -2682,8 +2937,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ -static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) +static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) { + if (!nested) { + memset(msrs, 0, sizeof(*msrs)); + return; + } + /* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the @@ -2701,70 +2961,68 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - vmx->nested.nested_vmx_pinbased_ctls_low, - vmx->nested.nested_vmx_pinbased_ctls_high); - vmx->nested.nested_vmx_pinbased_ctls_low |= + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + msrs->pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_pinbased_ctls_high &= + msrs->pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; - vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_VIRTUAL_NMIS | + (apicv ? 
PIN_BASED_POSTED_INTR : 0); + msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (kvm_vcpu_apicv_active(&vmx->vcpu)) - vmx->nested.nested_vmx_pinbased_ctls_high |= - PIN_BASED_POSTED_INTR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - vmx->nested.nested_vmx_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - vmx->nested.nested_vmx_exit_ctls_low = + msrs->exit_ctls_low, + msrs->exit_ctls_high); + msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_exit_ctls_high &= + msrs->exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - vmx->nested.nested_vmx_exit_ctls_high |= + msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (kvm_mpx_supported()) - vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; + msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - vmx->nested.nested_vmx_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - vmx->nested.nested_vmx_entry_ctls_low = + msrs->entry_ctls_low, + msrs->entry_ctls_high); + msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_entry_ctls_high &= + msrs->entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - vmx->nested.nested_vmx_entry_ctls_high |= + msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (kvm_mpx_supported()) - vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. 
*/ - vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - vmx->nested.nested_vmx_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - vmx->nested.nested_vmx_procbased_ctls_low = + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - vmx->nested.nested_vmx_procbased_ctls_high &= + msrs->procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2784,12 +3042,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - vmx->nested.nested_vmx_procbased_ctls_high |= + msrs->procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ - vmx->nested.nested_vmx_procbased_ctls_low &= + msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* @@ -2797,10 +3055,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * depend on CPUID bits, they are added later by vmx_cpuid_update. 
*/ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high); - vmx->nested.nested_vmx_secondary_ctls_low = 0; - vmx->nested.nested_vmx_secondary_ctls_high &= + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + msrs->secondary_ctls_low = 0; + msrs->secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | @@ -2810,33 +3068,33 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; - vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) - vmx->nested.nested_vmx_ept_caps |= + msrs->ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; - vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + msrs->ept_caps &= vmx_capability.ept; + msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; if (enable_ept_ad_bits) { - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + msrs->ept_caps |= VMX_EPT_AD_BIT; } } if (cpu_has_vmx_vmfunc()) { - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VMFUNC; /* * Advertise EPTP switching unconditionally * since we emulate it */ if (enable_ept) - vmx->nested.nested_vmx_vmfunc_controls = + msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } @@ -2847,25 +3105,25 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * not failing the single-context invvpid, and it is worse. 
*/ if (enable_vpid) { - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_VPID; - vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + msrs->vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; } if (enable_unrestricted_guest) - vmx->nested.nested_vmx_secondary_ctls_high |= + msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, - vmx->nested.nested_vmx_misc_low, - vmx->nested.nested_vmx_misc_high); - vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - vmx->nested.nested_vmx_misc_low |= + msrs->misc_low, + msrs->misc_high); + msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; + msrs->misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - vmx->nested.nested_vmx_misc_high = 0; + msrs->misc_high = 0; /* * This MSR reports some information about VMX support. We @@ -2873,14 +3131,14 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - vmx->nested.nested_vmx_basic = + msrs->basic = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); if (cpu_has_vmx_basic_inout()) - vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + msrs->basic |= VMX_BASIC_INOUT; /* * These MSRs specify bits which the guest must keep fixed on @@ -2889,15 +3147,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) #define VMXON_CR4_ALWAYSON X86_CR4_VMXE - vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; - vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; + msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. 
*/ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; + msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; } /* @@ -2934,7 +3192,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.nested_vmx_basic; + u64 vmx_basic = vmx->nested.msrs.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -2953,7 +3211,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) return -EINVAL; - vmx->nested.nested_vmx_basic = data; + vmx->nested.msrs.basic = data; return 0; } @@ -2965,24 +3223,24 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; - highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + lowp = &vmx->nested.msrs.pinbased_ctls_low; + highp = &vmx->nested.msrs.pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.nested_vmx_procbased_ctls_low; - highp = &vmx->nested.nested_vmx_procbased_ctls_high; + lowp = &vmx->nested.msrs.procbased_ctls_low; + highp = &vmx->nested.msrs.procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.nested_vmx_exit_ctls_low; - highp = &vmx->nested.nested_vmx_exit_ctls_high; + lowp = &vmx->nested.msrs.exit_ctls_low; + highp = &vmx->nested.msrs.exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.nested_vmx_entry_ctls_low; - highp = 
&vmx->nested.nested_vmx_entry_ctls_high; + lowp = &vmx->nested.msrs.entry_ctls_low; + highp = &vmx->nested.msrs.entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.nested_vmx_secondary_ctls_low; - highp = &vmx->nested.nested_vmx_secondary_ctls_high; + lowp = &vmx->nested.msrs.secondary_ctls_low; + highp = &vmx->nested.msrs.secondary_ctls_high; break; default: BUG(); @@ -3013,13 +3271,13 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) GENMASK_ULL(13, 9) | BIT_ULL(31); u64 vmx_misc; - vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, - vmx->nested.nested_vmx_misc_high); + vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; - if ((vmx->nested.nested_vmx_pinbased_ctls_high & + if ((vmx->nested.msrs.pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) && vmx_misc_preemption_timer_rate(data) != vmx_misc_preemption_timer_rate(vmx_misc)) @@ -3034,8 +3292,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) return -EINVAL; - vmx->nested.nested_vmx_misc_low = data; - vmx->nested.nested_vmx_misc_high = data >> 32; + vmx->nested.msrs.misc_low = data; + vmx->nested.msrs.misc_high = data >> 32; return 0; } @@ -3043,15 +3301,15 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { u64 vmx_ept_vpid_cap; - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, - vmx->nested.nested_vmx_vpid_caps); + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, + vmx->nested.msrs.vpid_caps); /* Every bit is either reserved or a feature bit. 
*/ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) return -EINVAL; - vmx->nested.nested_vmx_ept_caps = data; - vmx->nested.nested_vmx_vpid_caps = data >> 32; + vmx->nested.msrs.ept_caps = data; + vmx->nested.msrs.vpid_caps = data >> 32; return 0; } @@ -3061,10 +3319,10 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.nested_vmx_cr0_fixed0; + msr = &vmx->nested.msrs.cr0_fixed0; break; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.nested_vmx_cr4_fixed0; + msr = &vmx->nested.msrs.cr4_fixed0; break; default: BUG(); @@ -3128,7 +3386,7 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_VMX_EPT_VPID_CAP: return vmx_restore_vmx_ept_vpid_cap(vmx, data); case MSR_IA32_VMX_VMCS_ENUM: - vmx->nested.nested_vmx_vmcs_enum = data; + vmx->nested.msrs.vmcs_enum = data; return 0; default: /* @@ -3139,77 +3397,75 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Returns 0 on success, non-0 otherwise. 
*/ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - switch (msr_index) { case MSR_IA32_VMX_BASIC: - *pdata = vmx->nested.nested_vmx_basic; + *pdata = msrs->basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_pinbased_ctls_low, - vmx->nested.nested_vmx_pinbased_ctls_high); + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); + msrs->exit_ctls_low, + msrs->exit_ctls_high); if (msr_index == MSR_IA32_VMX_EXIT_CTLS) *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); + msrs->entry_ctls_low, + msrs->entry_ctls_high); if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_misc_low, - vmx->nested.nested_vmx_misc_high); + msrs->misc_low, + msrs->misc_high); break; case MSR_IA32_VMX_CR0_FIXED0: - *pdata = vmx->nested.nested_vmx_cr0_fixed0; + *pdata = msrs->cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = 
vmx->nested.nested_vmx_cr0_fixed1; + *pdata = msrs->cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = vmx->nested.nested_vmx_cr4_fixed0; + *pdata = msrs->cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = vmx->nested.nested_vmx_cr4_fixed1; + *pdata = msrs->cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = vmx->nested.nested_vmx_vmcs_enum; + *pdata = msrs->vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high); + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - *pdata = vmx->nested.nested_vmx_ept_caps | - ((u64)vmx->nested.nested_vmx_vpid_caps << 32); + *pdata = msrs->ept_caps | + ((u64)msrs->vpid_caps << 32); break; case MSR_IA32_VMX_VMFUNC: - *pdata = vmx->nested.nested_vmx_vmfunc_controls; + *pdata = msrs->vmfunc_controls; break; default: return 1; @@ -3226,6 +3482,20 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, return !(val & ~valid_bits); } +static int vmx_get_msr_feature(struct kvm_msr_entry *msr) +{ + switch (msr->index) { + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested) + return 1; + return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + default: + return 1; + } + + return 0; +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -3251,9 +3521,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); - case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(vcpu); - break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && @@ -3297,7 +3564,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); + return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, + &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3372,9 +3640,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); - break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && @@ -3590,6 +3855,14 @@ static int hardware_enable(void) if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); @@ -3688,6 +3961,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; + memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | @@ -3698,13 +3972,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; - if (!kvm_mwait_in_guest()) - min |= CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -3823,7 +4095,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->size = vmx_msr_high & 0x1fff; vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = 
vmx_msr_high & ~0x1fff; - vmcs_conf->revision_id = vmx_msr_low; + + /* KVM supports Enlightened VMCS v1 only */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs_conf->revision_id = KVM_EVMCS_VERSION; + else + vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; @@ -3831,6 +4108,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; + if (static_branch_unlikely(&enable_evmcs)) + evmcs_sanitize_exec_ctrls(vmcs_conf); + cpu_has_load_ia32_efer = allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_LOAD_IA32_EFER) @@ -4150,6 +4430,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); @@ -4165,13 +4446,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Warn the user that an update is overdue. 
*/ - if (!vcpu->kvm->arch.tss_addr) + if (!kvm_vmx->tss_addr) printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); + vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); @@ -4263,12 +4544,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } -static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) -{ - if (enable_ept) - vmx_flush_tlb(vcpu, true); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4279,7 +4554,7 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) static void vmx_decache_cr3(struct kvm_vcpu *vcpu) { - if (enable_ept && is_paging(vcpu)) + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } @@ -4327,11 +4602,11 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { - u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); @@ -4341,16 +4616,16 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { - u64 fixed0 = 
to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; return fixed_bits_valid(val, fixed0, fixed1); } static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) { - u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; return fixed_bits_valid(val, fixed0, fixed1); } @@ -4416,7 +4691,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept) + if (enable_ept && !enable_unrestricted_guest) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -4457,10 +4732,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); - if (is_paging(vcpu) || is_guest_mode(vcpu)) + if (enable_unrestricted_guest || is_paging(vcpu) || + is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -4475,17 +4751,22 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ - unsigned long hw_cr4 = - (cr4_read_shadow() & X86_CR4_MCE) | - (cr4 & ~X86_CR4_MCE) | - (to_vmx(vcpu)->rmode.vm86_active ? 
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + unsigned long hw_cr4; + + hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); + if (enable_unrestricted_guest) + hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; + else if (to_vmx(vcpu)->rmode.vm86_active) + hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; + else + hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; - } else + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); @@ -4504,16 +4785,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; vcpu->arch.cr4 = cr4; - if (enable_ept) { - if (!is_paging(vcpu)) { - hw_cr4 &= ~X86_CR4_PAE; - hw_cr4 |= X86_CR4_PSE; - } else if (!(cr4 & X86_CR4_PAE)) { - hw_cr4 &= ~X86_CR4_PAE; + + if (!enable_unrestricted_guest) { + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } } - } - if (!enable_unrestricted_guest && !is_paging(vcpu)) /* * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in * hardware. To emulate this behavior, SMEP/SMAP/PKU needs @@ -4525,7 +4807,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * If enable_unrestricted_guest, the CPU automatically * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 
*/ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + if (!is_paging(vcpu)) + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -4893,7 +5177,7 @@ static int init_rmode_tss(struct kvm *kvm) int idx, r; idx = srcu_read_lock(&kvm->srcu); - fn = kvm->arch.tss_addr >> PAGE_SHIFT; + fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -4919,22 +5203,23 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i, idx, r = 0; kvm_pfn_t identity_map_pfn; u32 tmp; - /* Protect kvm->arch.ept_identity_pagetable_done. */ + /* Protect kvm_vmx->ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); - if (likely(kvm->arch.ept_identity_pagetable_done)) + if (likely(kvm_vmx->ept_identity_pagetable_done)) goto out2; - if (!kvm->arch.ept_identity_map_addr) - kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + if (!kvm_vmx->ept_identity_map_addr) + kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm->arch.ept_identity_map_addr, PAGE_SIZE); + kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (r < 0) goto out2; @@ -4951,7 +5236,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (r < 0) goto out; } - kvm->arch.ept_identity_pagetable_done = true; + kvm_vmx->ept_identity_pagetable_done = true; out: srcu_read_unlock(&kvm->srcu, idx); @@ -5487,6 +5772,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) exec_control |= CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_INVLPG_EXITING; + if (kvm_mwait_in_guest(vmx->vcpu.kvm)) + exec_control &= ~(CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING); + if 
(kvm_hlt_in_guest(vmx->vcpu.kvm)) + exec_control &= ~CPU_BASED_HLT_EXITING; return exec_control; } @@ -5520,7 +5810,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) + if (kvm_pause_in_guest(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -5552,10 +5842,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (xsaves_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_XSAVES; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_XSAVES; } } @@ -5567,10 +5857,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_RDTSCP; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_RDTSCP; } } @@ -5588,10 +5878,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (invpcid_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_ENABLE_INVPCID; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_ENABLE_INVPCID; } } @@ -5603,10 +5893,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (rdrand_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_RDRAND_EXITING; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_RDRAND_EXITING; } } @@ -5618,10 +5908,10 @@ static void 
vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (nested) { if (rdseed_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= + vmx->nested.msrs.secondary_ctls_high |= SECONDARY_EXEC_RDSEED_EXITING; else - vmx->nested.nested_vmx_secondary_ctls_high &= + vmx->nested.msrs.secondary_ctls_high &= ~SECONDARY_EXEC_RDSEED_EXITING; } } @@ -5683,7 +5973,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (ple_gap) { + if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; @@ -5765,6 +6055,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; + vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -5847,6 +6138,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) update_exception_bitmap(vcpu); vpid_sync_context(vmx->vpid); + if (init_event) + vmx_clear_hlt(vcpu); } /* @@ -5871,8 +6164,7 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) { - return get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_NMI_EXITING; + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@ -5918,6 +6210,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + + vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -5948,6 +6242,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + + vmx_clear_hlt(vcpu); } static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -6010,14 +6306,23 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; + if 
(enable_unrestricted_guest) + return 0; + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, PAGE_SIZE * 3); if (ret) return ret; - kvm->arch.tss_addr = addr; + to_kvm_vmx(kvm)->tss_addr = addr; return init_rmode_tss(kvm); } +static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; + return 0; +} + static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { switch (vec) { @@ -6120,19 +6425,24 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ - if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (is_invalid_opcode(intr_info)) + return handle_ud(vcpu); error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { + WARN_ON_ONCE(!enable_vmware_backdoor); + er = emulate_instruction(vcpu, + EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); + if (er == EMULATE_USER_EXIT) + return 0; + else if (er != EMULATE_DONE) + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; + } + /* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing * MMIO, it is better to report an internal error. 
@@ -6171,7 +6481,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= dr6 | DR6_RTM; - if (!(dr6 & ~DR6_RESERVED)) /* icebp */ + if (is_icebp(intr_info)) skip_emulated_instruction(vcpu); kvm_queue_exception(vcpu, DB_VECTOR); @@ -6218,28 +6528,22 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string, ret; + int size, in, string; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; - in = (exit_qualification & 8) != 0; ++vcpu->stat.io_exits; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; - ret = kvm_skip_emulated_instruction(vcpu); - - /* - * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. 
- */ - return kvm_fast_pio_out(vcpu, size, port) && ret; + return kvm_fast_pio(vcpu, size, port, in); } static void @@ -6330,6 +6634,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) err = handle_set_cr0(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 3: + WARN_ON_ONCE(enable_unrestricted_guest); err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -6362,6 +6667,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 1: /*mov from cr*/ switch (cr) { case 3: + WARN_ON_ONCE(enable_unrestricted_guest); val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -6755,7 +7061,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { - int ret; gpa_t gpa; /* @@ -6783,17 +7088,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) NULL, 0) == EMULATE_DONE; } - ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); - if (ret >= 0) - return ret; - - /* It is the real ept misconfig */ - WARN_ON(1); - - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; - - return 0; + return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu) @@ -6816,6 +7111,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) bool intr_window_requested; unsigned count = 130; + /* + * We should never reach the point where we are emulating L2 + * due to invalid guest state as that means we incorrectly + * allowed a nested VMEntry with an invalid vmcs12. 
+ */ + WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; @@ -6834,12 +7136,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (err != EMULATE_DONE) + goto emulation_error; + + if (vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending) + goto emulation_error; if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; @@ -6855,34 +7157,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) out: return ret; -} - -static int __grow_ple_window(int val) -{ - if (ple_window_grow < 1) - return ple_window; - - val = min(val, ple_window_actual_max); - - if (ple_window_grow < ple_window) - val *= ple_window_grow; - else - val += ple_window_grow; - - return val; -} - -static int __shrink_ple_window(int val, int modifier, int minimum) -{ - if (modifier < 1) - return ple_window; - if (modifier < ple_window) - val /= modifier; - else - val -= modifier; - - return max(val, minimum); +emulation_error: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } static void grow_ple_window(struct kvm_vcpu *vcpu) @@ -6890,7 +7170,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); int old = vmx->ple_window; - vmx->ple_window = __grow_ple_window(old); + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); if (vmx->ple_window != old) vmx->ple_window_dirty = true; @@ -6903,8 +7185,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); int old = 
vmx->ple_window; - vmx->ple_window = __shrink_ple_window(old, - ple_window_shrink, ple_window); + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); if (vmx->ple_window != old) vmx->ple_window_dirty = true; @@ -6913,21 +7196,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } /* - * ple_window_actual_max is computed to be one grow_ple_window() below - * ple_window_max. (See __grow_ple_window for the reason.) - * This prevents overflows, because ple_window_max is int. - * ple_window_max effectively rounded down to a multiple of ple_window_grow in - * this process. - * ple_window_max is also prevented from setting vmx->ple_window < ple_window. - */ -static void update_ple_window_actual_max(void) -{ - ple_window_actual_max = - __shrink_ple_window(max(ple_window_max, ple_window), - ple_window_grow, INT_MIN); -} - -/* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ static void wakeup_handler(void) @@ -6946,7 +7214,7 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } -void vmx_enable_tdp(void) +static void vmx_enable_tdp(void) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, @@ -7047,8 +7315,6 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); - update_ple_window_actual_max(); - /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. 
@@ -7080,6 +7346,7 @@ static __init int hardware_setup(void) init_vmcs_shadow_fields(); kvm_set_posted_intr_wakeup_handler(wakeup_handler); + nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -7108,7 +7375,7 @@ static __exit void hardware_unsetup(void) */ static int handle_pause(struct kvm_vcpu *vcpu) { - if (ple_gap) + if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); /* @@ -7940,9 +8207,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(vmx->nested.nested_vmx_secondary_ctls_high & + if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -7953,7 +8220,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, @@ -8004,9 +8271,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; - if (!(vmx->nested.nested_vmx_secondary_ctls_high & + if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || - !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -8017,7 +8284,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps & + types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) { @@ 
-8111,11 +8378,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) /* Check for memory type validity */ switch (address & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: - if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT)) + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) return false; break; case VMX_EPTP_MT_WB: - if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT)) + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) return false; break; default: @@ -8132,7 +8399,7 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) /* AD, if set, should be supported */ if (address & VMX_EPTP_AD_ENABLE_BIT) { - if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT)) + if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) return false; } @@ -8776,7 +9043,8 @@ static void dump_vmcs(void) pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); - if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + if (cpu_has_load_perf_global_ctrl && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) @@ -8812,7 +9080,8 @@ static void dump_vmcs(void) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_EFER), vmcs_read64(HOST_IA32_PAT)); - if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if (cpu_has_load_perf_global_ctrl && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); @@ -9003,7 +9272,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -9031,7 +9300,7 @@ static void 
vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -9164,9 +9433,9 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if (is_nmi(exit_intr_info)) { - kvm_before_handle_nmi(&vmx->vcpu); + kvm_before_interrupt(&vmx->vcpu); asm("int $2"); - kvm_after_handle_nmi(&vmx->vcpu); + kvm_after_interrupt(&vmx->vcpu); } } @@ -9389,7 +9658,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr3, cr4, evmcs_rsp; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -9452,9 +9721,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * being speculatively taken. */ if (vmx->spec_ctrl) - wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); vmx->__launched = vmx->loaded_vmcs->launched; + + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? 
+ (unsigned long)¤t_evmcs->host_rsp : 0; + asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" @@ -9463,15 +9736,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" "je 1f \n\t" "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + /* Avoid VMWRITE when Enlightened VMCS is in use */ + "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "jz 2f \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" + "jmp 1f \n\t" + "2: \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ "mov %c[cr2](%0), %%" _ASM_AX " \n\t" "mov %%cr2, %%" _ASM_DX " \n\t" "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 2f \n\t" + "je 3f \n\t" "mov %%" _ASM_AX", %%cr2 \n\t" - "2: \n\t" + "3: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ @@ -9540,7 +9819,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ".global vmx_return \n\t" "vmx_return: " _ASM_PTR " 2b \n\t" ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), + : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), @@ -9565,10 +9844,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi", "rsi" + , "rax", "rbx", "rdi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edi", "esi" + , "eax", "ebx", "edi" #endif ); @@ -9587,15 +9866,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. 
*/ - if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) - rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); if (vmx->spec_ctrl) - wrmsrl(MSR_IA32_SPEC_CTRL, 0); + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); + /* All fields are clean at this point */ + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) update_debugctlmsr(vmx->host_debugctlmsr); @@ -9632,14 +9916,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. 
- */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -9656,6 +9932,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(vmx_vcpu_run); +static struct kvm *vmx_vm_alloc(void) +{ + struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL); + return &kvm_vmx->kvm; +} + +static void vmx_vm_free(struct kvm *kvm) +{ + kfree(to_kvm_vmx(kvm)); +} + static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -9763,14 +10050,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (enable_ept) { + if (enable_ept && !enable_unrestricted_guest) { err = init_rmode_identity_map(kvm); if (err) goto free_vmcs; } if (nested) { - nested_vmx_setup_ctls_msrs(vmx); + nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, + kvm_vcpu_apicv_active(&vmx->vcpu)); vmx->nested.vpid02 = allocate_vpid(); } @@ -9803,6 +10091,13 @@ free_vcpu: return ERR_PTR(err); } +static int vmx_vm_init(struct kvm *kvm) +{ + if (!ple_gap) + kvm->arch.pause_in_guest = true; + return 0; +} + static void __init vmx_check_processor_compat(void *rtn) { struct vmcs_config vmcs_conf; @@ -9810,6 +10105,7 @@ static void __init vmx_check_processor_compat(void *rtn) *(int *)rtn = 0; if (setup_vmcs_config(&vmcs_conf) < 0) *(int *)rtn = -EIO; + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); @@ -9897,12 +10193,12 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *entry; - vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; - vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + vmx->nested.msrs.cr0_fixed1 = 0xffffffff; + vmx->nested.msrs.cr4_fixed1 
= X86_CR4_PCE; #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ if (entry && (entry->_reg & (_cpuid_mask))) \ - vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ + vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); @@ -9999,7 +10295,7 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.nested_vmx_ept_caps & + to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; @@ -10136,7 +10432,10 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); } - if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) + if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); + else vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_USE_MSR_BITMAPS); } @@ -10224,8 +10523,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * updated to reflect this when L1 (or its L2s) actually write to * the MSR. */ - bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); /* Nothing to do if the MSR bitmap is not in use. 
*/ if (!cpu_has_vmx_msr_bitmap() || @@ -10294,6 +10593,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !page_address_valid(vcpu, vmcs12->apic_access_addr)) + return -EINVAL; + else + return 0; +} + static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -10693,6 +11002,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control, vmcs12_exec_ctrl; + if (vmx->nested.dirty_vmcs12) { + prepare_vmcs02_full(vcpu, vmcs12, from_vmentry); + vmx->nested.dirty_vmcs12 = false; + } + /* * First, the fields that are shadowed. This must be kept in sync * with vmx_shadow_fields.h. @@ -10857,11 +11171,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vmcs_write64(TSC_OFFSET, - vcpu->arch.tsc_offset + vmcs12->tsc_offset); - else - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -10903,7 +11214,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* @@ -10930,9 +11241,14 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); - if (vmx->nested.dirty_vmcs12) { - prepare_vmcs02_full(vcpu, vmcs12, from_vmentry); - vmx->nested.dirty_vmcs12 = false; + /* + * Guest state is invalid and unrestricted guest is disabled, + 
* which means L1 attempted VMEntry to L2 with invalid state. + * Fail the VMEntry. + */ + if (vmx->emulation_required) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; } /* Shadow page tables on either EPT or shadow page tables. */ @@ -10948,6 +11264,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } +static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12)) + return -EINVAL; + + if (!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) + return -EINVAL; + + return 0; +} + static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10962,6 +11291,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10975,26 +11307,29 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high) || + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high) || (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high)) || + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - vmx->nested.nested_vmx_pinbased_ctls_low, - 
vmx->nested.nested_vmx_pinbased_ctls_high) || + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high) || + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high)) + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_nmi_controls(vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (nested_cpu_has_vmfunc(vmcs12)) { if (vmcs12->vm_function_control & - ~vmx->nested.nested_vmx_vmfunc_controls) + ~vmx->nested.msrs.vmfunc_controls) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (nested_cpu_has_eptp_switching(vmcs12)) { @@ -11080,6 +11415,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 msr_entry_idx; u32 exit_qual; + int r; enter_guest_mode(vcpu); @@ -11089,26 +11425,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { - leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qual); - return 1; - } + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset += vmcs12->tsc_offset; + + r = EXIT_REASON_INVALID_STATE; + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) + goto fail; nested_get_vmcs12_pages(vcpu, vmcs12); + r = EXIT_REASON_MSR_LOAD_FAIL; msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) { - leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, 
&vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); - return 1; - } + if (msr_entry_idx) + goto fail; /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11117,6 +11448,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 0; + +fail: + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + leave_guest_mode(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); + return 1; } /* @@ -11196,7 +11535,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (ret) return ret; - if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken + * by event injection, halt vcpu. + */ + if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && + !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) return kvm_vcpu_halt(vcpu); vmx->nested.nested_run_pending = 1; @@ -11271,7 +11615,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; - } else if (vcpu->arch.interrupt.pending) { + } else if (vcpu->arch.interrupt.injected) { nr = vcpu->arch.interrupt.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -11683,6 +12027,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (likely(!vmx->fail)) { if (exit_reason == -1) sync_vmcs12(vcpu, vmcs12); @@ -11720,7 +12067,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, } else if (!nested_cpu_has_ept(vmcs12) && nested_cpu_has2(vmcs12, 
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -11879,10 +12226,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(); - u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); - u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + struct vcpu_vmx *vmx; + u64 tscl, guest_tscl, delta_tsc; + + if (kvm_mwait_in_guest(vcpu->kvm)) + return -EOPNOTSUPP; + + vmx = to_vmx(vcpu); + tscl = rdtsc(); + guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && @@ -11919,7 +12272,7 @@ static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { - if (ple_gap) + if (!kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); } @@ -12188,7 +12541,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; - trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); if (set) @@ -12237,6 +12590,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) vmx->nested.smm.vmxon = vmx->nested.vmxon; vmx->nested.vmxon = false; + vmx_clear_hlt(vcpu); return 0; } @@ -12278,6 +12632,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_accelerated_tpr = report_flexpriority, .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, + .vm_init = vmx_vm_init, + .vm_alloc = vmx_vm_alloc, + .vm_free = vmx_vm_free, + .vcpu_create = vmx_create_vcpu, 
.vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, @@ -12287,6 +12645,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_put = vmx_vcpu_put, .update_bp_intercept = update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -12344,6 +12703,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, @@ -12360,6 +12720,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, @@ -12402,7 +12763,38 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { static int __init vmx_init(void) { - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + int r; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. We can also disable eVMCS support + * with module parameter. 
+ */ + if (enlightened_vmcs && + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + int cpu; + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + } else { + enlightened_vmcs = false; + } +#endif + + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; @@ -12423,6 +12815,29 @@ static void __exit vmx_exit(void) #endif kvm_exit(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (static_branch_unlikely(&enable_evmcs)) { + int cpu; + struct hv_vp_assist_page *vp_ap; + /* + * Reset everything to support using non-enlightened VMCS + * access later (e.g. when we reload the module with + * enlightened_vmcs=0) + */ + for_each_online_cpu(cpu) { + vp_ap = hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; + } + + static_branch_disable(&enable_evmcs); + } +#endif } module_init(vmx_init) diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h new file mode 100644 index 000000000000..210a884090ad --- /dev/null +++ b/arch/x86/kvm/vmx_evmcs.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_EVMCS_H +#define __KVM_X86_VMX_EVMCS_H + +#include <asm/hyperv-tlfs.h> + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +static const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, 
guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. 
+ */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 
bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + 
EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + unsigned int index = ROL16(field, 6); + const struct evmcs_field *evmcs_field; + + if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { + WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", + field); + return -ENOENT; + } + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +#undef ROL16 + +#endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8a0b545ac20..51ecd381793b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -102,6 +102,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, 
unsigned long rflags); +static void store_regs(struct kvm_vcpu *vcpu); +static int sync_regs(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -140,6 +142,13 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); +bool __read_mostly enable_vmware_backdoor = false; +module_param(enable_vmware_backdoor, bool, S_IRUGO); +EXPORT_SYMBOL_GPL(enable_vmware_backdoor); + +static bool __read_mostly force_emulation_prefix = false; +module_param(force_emulation_prefix, bool, S_IRUGO); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1032,7 +1041,11 @@ static u32 emulated_msrs[] = { HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_SCONTROL, HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, + + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_IA32_TSC_ADJUST, @@ -1049,6 +1062,64 @@ static u32 emulated_msrs[] = { static unsigned num_emulated_msrs; +/* + * List of msr numbers which are used to expose MSR-based features that + * can be used by a hypervisor to validate requested CPU features. 
+ */ +static u32 msr_based_features[] = { + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR0_FIXED1, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_CR4_FIXED1, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_F10H_DECFG, + MSR_IA32_UCODE_REV, +}; + +static unsigned int num_msr_based_features; + +static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +{ + switch (msr->index) { + case MSR_IA32_UCODE_REV: + rdmsrl(msr->index, msr->data); + break; + default: + if (kvm_x86_ops->get_msr_feature(msr)) + return 1; + } + return 0; +} + +static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + struct kvm_msr_entry msr; + int r; + + msr.index = index; + r = kvm_get_msr_feature(&msr); + if (r) + return r; + + *data = msr.data; + + return 0; +} + bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) @@ -1419,7 +1490,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = vcpu->arch.tsc_offset; + u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1461,7 +1532,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + + return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -2222,7 +2295,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch 
(msr) { case MSR_AMD64_NB_CFG: - case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: @@ -2230,6 +2302,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_DC_CFG: break; + case MSR_IA32_UCODE_REV: + if (msr_info->host_initiated) + vcpu->arch.microcode_version = data; + break; case MSR_EFER: return set_efer(vcpu, data); case MSR_K7_HWCR: @@ -2288,6 +2364,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -2390,6 +2469,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: @@ -2516,6 +2598,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_DC_CFG: msr_info->data = 0; break; + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -2525,7 +2608,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_IA32_UCODE_REV: - msr_info->data = 0x100000000ULL; + msr_info->data = vcpu->arch.microcode_version; + break; + case MSR_IA32_TSC: + msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; break; case MSR_MTRRcap: case 0x200 ... 0x2ff: @@ -2619,6 +2705,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data); break; @@ -2680,13 +2769,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i, idx; + int i; - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - srcu_read_unlock(&vcpu->kvm->srcu, idx); return i; } @@ -2737,9 +2824,16 @@ out: return r; } +static inline bool kvm_can_mwait_in_guest(void) +{ + return boot_cpu_has(X86_FEATURE_MWAIT) && + !boot_cpu_has_bug(X86_BUG_MONITOR) && + boot_cpu_has(X86_FEATURE_ARAT); +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { - int r; + int r = 0; switch (ext) { case KVM_CAP_IRQCHIP: @@ -2769,6 +2863,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_HYPERV_VP_INDEX: + case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2785,13 +2880,19 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_GET_MSR_FEATURES: r = 1; break; + case KVM_CAP_SYNC_REGS: + r = KVM_SYNC_X86_VALID_FIELDS; + break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; - case KVM_CAP_X86_GUEST_MWAIT: - r = kvm_mwait_in_guest(); + case KVM_CAP_X86_DISABLE_EXITS: + r |= KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE; + if(kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, @@ -2832,7 +2933,6 @@ int 
kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; default: - r = 0; break; } return r; @@ -2899,6 +2999,31 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; r = 0; break; + case KVM_GET_MSR_FEATURE_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned int n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msr_based_features; + if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) + goto out; + r = -E2BIG; + if (n < msr_list.nmsrs) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msr_based_features, + num_msr_based_features * sizeof(u32))) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(NULL, argp, do_get_msr_feature, 1); + break; } default: r = -EINVAL; @@ -3199,7 +3324,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->exception.error_code = vcpu->arch.exception.error_code; events->interrupt.injected = - vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; + vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); @@ -3252,7 +3377,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; - vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.injected = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) @@ -3636,12 +3761,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_MSRS: + case KVM_GET_MSRS: { + int idx 
= srcu_read_lock(&vcpu->kvm->srcu); r = msr_io(vcpu, argp, do_get_msr, 1); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; - case KVM_SET_MSRS: + } + case KVM_SET_MSRS: { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = msr_io(vcpu, argp, do_set_msr, 0); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; + } case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; @@ -3845,8 +3976,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - kvm->arch.ept_identity_map_addr = ident_addr; - return 0; + return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -4106,6 +4236,20 @@ split_irqchip_unlock: r = 0; break; + case KVM_CAP_X86_DISABLE_EXITS: + r = -EINVAL; + if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) + break; + + if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && + kvm_can_mwait_in_guest()) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HTL) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) + kvm->arch.pause_in_guest = true; + r = 0; + break; default: r = -EINVAL; break; @@ -4410,6 +4554,15 @@ set_identity_unlock: r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); break; } + case KVM_HYPERV_EVENTFD: { + struct kvm_hyperv_eventfd hvevfd; + + r = -EFAULT; + if (copy_from_user(&hvevfd, argp, sizeof(hvevfd))) + goto out; + r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); + break; + } default: r = -ENOTTY; } @@ -4464,6 +4617,19 @@ static void kvm_init_msr_list(void) j++; } num_emulated_msrs = j; + + for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + struct kvm_msr_entry msr; + + msr.index = msr_based_features[i]; + if (kvm_get_msr_feature(&msr)) + continue; + + if (j < i) + msr_based_features[j] = msr_based_features[i]; + j++; + } + num_msr_based_features = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, 
int len, @@ -4686,6 +4852,30 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +int handle_ud(struct kvm_vcpu *vcpu) +{ + int emul_type = EMULTYPE_TRAP_UD; + enum emulation_result er; + char sig[5]; /* ud2; .ascii "kvm" */ + struct x86_exception e; + + if (force_emulation_prefix && + kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, + kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && + memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { + kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); + emul_type = 0; + } + + er = emulate_instruction(vcpu, emul_type); + if (er == EMULATE_USER_EXIT) + return 0; + if (er != EMULATE_DONE) + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} +EXPORT_SYMBOL_GPL(handle_ud); + static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t gpa, bool write) { @@ -5527,27 +5717,27 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = 0; - else - vcpu->arch.interrupt.pending = false; - return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); -static int handle_emulation_failure(struct kvm_vcpu *vcpu) +static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { int r = EMULATE_DONE; ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); + + if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) + return EMULATE_FAIL; + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; r = EMULATE_USER_EXIT; } + kvm_queue_exception(vcpu, UD_VECTOR); return r; @@ -5791,6 +5981,37 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) return false; } +static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) +{ + switch (ctxt->opcode_len) { + case 1: + switch (ctxt->b) 
{ + case 0xe4: /* IN */ + case 0xe5: + case 0xec: + case 0xed: + case 0xe6: /* OUT */ + case 0xe7: + case 0xee: + case 0xef: + case 0x6c: /* INS */ + case 0x6d: + case 0x6e: /* OUTS */ + case 0x6f: + return true; + } + break; + case 2: + switch (ctxt->b) { + case 0x33: /* RDPMC */ + return true; + } + break; + } + + return false; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -5843,10 +6064,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; - return handle_emulation_failure(vcpu); + return handle_emulation_failure(vcpu, emulation_type); } } + if ((emulation_type & EMULTYPE_VMWARE) && + !is_vmware_backdoor_opcode(ctxt)) + return EMULATE_FAIL; + if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) @@ -5878,7 +6103,7 @@ restart: emulation_type)) return EMULATE_DONE; - return handle_emulation_failure(vcpu); + return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { @@ -5931,7 +6156,8 @@ restart: } EXPORT_SYMBOL_GPL(x86_emulate_instruction); -int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) +static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port) { unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, @@ -5940,7 +6166,6 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) vcpu->arch.pio.count = 0; return ret; } -EXPORT_SYMBOL_GPL(kvm_fast_pio_out); static int complete_fast_pio_in(struct kvm_vcpu *vcpu) { @@ -5964,7 +6189,8 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) return 1; } -int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) +static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port) { unsigned long val; int ret; @@ -5983,7 +6209,21 @@ int 
kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) return 0; } -EXPORT_SYMBOL_GPL(kvm_fast_pio_in); + +int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + if (in) + return kvm_fast_pio_in(vcpu, size, port) && ret; + else + return kvm_fast_pio_out(vcpu, size, port) && ret; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio); static int kvmclock_cpu_down_prep(unsigned int cpu) { @@ -6161,7 +6401,8 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); +DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); +EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); int kvm_is_in_guest(void) { @@ -6194,18 +6435,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .get_guest_ip = kvm_get_guest_ip, }; -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(current_vcpu, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); - -void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(current_vcpu, NULL); -} -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); - static void kvm_set_mmio_spte_mask(void) { u64 mask; @@ -6559,27 +6788,36 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) int r; /* try to reinject previous events if any */ - if (vcpu->arch.exception.injected) { - kvm_x86_ops->queue_exception(vcpu); - return 0; - } + if (vcpu->arch.exception.injected) + kvm_x86_ops->queue_exception(vcpu); /* - * Exceptions must be injected immediately, or the exception - * frame will have the address of the NMI or interrupt handler. + * Do not inject an NMI or interrupt if there is a pending + * exception. Exceptions and interrupts are recognized at + * instruction boundaries, i.e. the start of an instruction. + * Trap-like exceptions, e.g. 
#DB, have higher priority than + * NMIs and interrupts, i.e. traps are recognized before an + * NMI/interrupt that's pending on the same instruction. + * Fault-like exceptions, e.g. #GP and #PF, are the lowest + * priority, but are only generated (pended) during instruction + * execution, i.e. a pending fault-like exception means the + * fault occurred on the *previous* instruction and must be + * serviced prior to recognizing any new events in order to + * fully complete the previous instruction. */ - if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) { + else if (!vcpu->arch.exception.pending) { + if (vcpu->arch.nmi_injected) kvm_x86_ops->set_nmi(vcpu); - return 0; - } - - if (vcpu->arch.interrupt.pending) { + else if (vcpu->arch.interrupt.injected) kvm_x86_ops->set_irq(vcpu); - return 0; - } } + /* + * Call check_nested_events() even if we reinjected a previous event + * in order for caller to determine if it should require immediate-exit + * from L2 to L1 due to pending L1 events which require exit + * from L2 to L1. 
+ */ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); if (r != 0) @@ -6592,6 +6830,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); + WARN_ON_ONCE(vcpu->arch.exception.injected); vcpu->arch.exception.pending = false; vcpu->arch.exception.injected = true; @@ -6606,7 +6845,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } kvm_x86_ops->queue_exception(vcpu); - } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { + } + + /* Don't consider new event if we re-injected an event */ + if (kvm_event_needs_reinjection(vcpu)) + return 0; + + if (vcpu->arch.smi_pending && !is_smm(vcpu) && + kvm_x86_ops->smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; ++vcpu->arch.smi_count; enter_smm(vcpu); @@ -6900,8 +7146,6 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; @@ -6914,6 +7158,20 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } + + if (is_guest_mode(vcpu)) + vcpu->arch.load_eoi_exitmap_pending = true; + else + kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); +} + +static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) +{ + u64 eoi_exit_bitmap[4]; + + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) + return; + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, vcpu_to_synic(vcpu)->vec_bitmap, 256); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); @@ -7028,6 +7286,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu)) + vcpu_load_eoi_exitmap(vcpu); 
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { @@ -7206,7 +7466,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); + kvm_before_interrupt(vcpu); kvm_x86_ops->handle_external_intr(vcpu); + kvm_after_interrupt(vcpu); ++vcpu->stat.exits; @@ -7415,7 +7677,6 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } - int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -7441,6 +7702,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } + if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + r = -EINVAL; + goto out; + } + + if (vcpu->run->kvm_dirty_regs) { + r = sync_regs(vcpu); + if (r != 0) + goto out; + } + /* re-sync apic's tpr */ if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { @@ -7465,6 +7737,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: kvm_put_guest_fpu(vcpu); + if (vcpu->run->kvm_valid_regs) + store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@ -7472,10 +7746,8 @@ out: return r; } -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { /* * We are here if userspace calls get_regs() in the middle of @@ -7508,15 +7780,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); +} +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + __get_regs(vcpu, regs); vcpu_put(vcpu); return 0; } -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - 
vcpu_load(vcpu); - vcpu->arch.emulate_regs_need_sync_from_vcpu = true; vcpu->arch.emulate_regs_need_sync_to_vcpu = false; @@ -7545,7 +7820,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; kvm_make_request(KVM_REQ_EVENT, vcpu); +} +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + __set_regs(vcpu, regs); vcpu_put(vcpu); return 0; } @@ -7560,13 +7840,10 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) } EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; - vcpu_load(vcpu); - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -7594,10 +7871,16 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); - if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) + if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); +} +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + vcpu_load(vcpu); + __get_sregs(vcpu, sregs); vcpu_put(vcpu); return 0; } @@ -7669,7 +7952,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); -int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* @@ -7692,8 +7975,7 @@ int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) 
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct msr_data apic_base_msr; int mmu_reset_needed = 0; @@ -7701,8 +7983,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct desc_ptr dt; int ret = -EINVAL; - vcpu_load(vcpu); - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (sregs->cr4 & X86_CR4_OSXSAVE)) goto out; @@ -7781,6 +8061,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, ret = 0; out: + return ret; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int ret; + + vcpu_load(vcpu); + ret = __set_sregs(vcpu, sregs); vcpu_put(vcpu); return ret; } @@ -7907,6 +8197,45 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +static void store_regs(struct kvm_vcpu *vcpu) +{ + BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES); + + if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS) + __get_regs(vcpu, &vcpu->run->s.regs.regs); + + if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS) + __get_sregs(vcpu, &vcpu->run->s.regs.sregs); + + if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS) + kvm_vcpu_ioctl_x86_get_vcpu_events( + vcpu, &vcpu->run->s.regs.events); +} + +static int sync_regs(struct kvm_vcpu *vcpu) +{ + if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) + return -EINVAL; + + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { + __set_regs(vcpu, &vcpu->run->s.regs.regs); + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; + } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { + if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs)) + return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; + } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { + if (kvm_vcpu_ioctl_x86_set_vcpu_events( + vcpu, &vcpu->run->s.regs.events)) + return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; + } + + return 0; +} + static void fx_init(struct kvm_vcpu *vcpu) { 
fpstate_init(&vcpu->arch.guest_fpu.state); @@ -8017,6 +8346,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + kvm_lapic_reset(vcpu, init_event); + vcpu->arch.hflags = 0; vcpu->arch.smi_pending = 0; @@ -8360,7 +8691,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - mutex_init(&kvm->arch.hyperv.hv_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); @@ -8369,6 +8699,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); @@ -8460,10 +8791,8 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) return r; } - if (!size) { - r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); - WARN_ON(r < 0); - } + if (!size) + vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); return 0; } @@ -8501,6 +8830,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); + kvm_hv_destroy_vm(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b91215d1fd80..c9492f764902 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,12 +2,48 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H -#include <asm/processor.h> -#include <asm/mwait.h> #include <linux/kvm_host.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" +#define KVM_DEFAULT_PLE_GAP 128 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_DEFAULT_PLE_WINDOW_GROW 2 +#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0 +#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX +#define 
KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX +#define KVM_SVM_DEFAULT_PLE_WINDOW 3000 + +static inline unsigned int __grow_ple_window(unsigned int val, + unsigned int base, unsigned int modifier, unsigned int max) +{ + u64 ret = val; + + if (modifier < 1) + return base; + + if (modifier < base) + ret *= modifier; + else + ret += modifier; + + return min(ret, (u64)max); +} + +static inline unsigned int __shrink_ple_window(unsigned int val, + unsigned int base, unsigned int modifier, unsigned int min) +{ + if (modifier < 1) + return base; + + if (modifier < base) + val /= modifier; + else + val -= modifier; + + return max(val, min); +} + #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) @@ -19,19 +55,19 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, bool soft) { - vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.injected = true; vcpu->arch.interrupt.soft = soft; vcpu->arch.interrupt.nr = vector; } static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) { - vcpu->arch.interrupt.pending = false; + vcpu->arch.interrupt.injected = false; } static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) { - return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending || + return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected || vcpu->arch.nmi_injected; } @@ -205,8 +241,6 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) return !(kvm->arch.disabled_quirks & quirk); } -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); -void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); @@ -221,6 +255,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct 
x86_exception *exception); +int handle_ud(struct kvm_vcpu *vcpu); + void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); @@ -242,6 +278,8 @@ extern unsigned int min_timer_period_us; extern unsigned int lapic_timer_advance_ns; +extern bool enable_vmware_backdoor; + extern struct static_key kvm_no_apic_vcpu; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) @@ -264,10 +302,31 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) -static inline bool kvm_mwait_in_guest(void) +static inline bool kvm_mwait_in_guest(struct kvm *kvm) +{ + return kvm->arch.mwait_in_guest; +} + +static inline bool kvm_hlt_in_guest(struct kvm *kvm) +{ + return kvm->arch.hlt_in_guest; +} + +static inline bool kvm_pause_in_guest(struct kvm *kvm) +{ + return kvm->arch.pause_in_guest; +} + +DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); + +static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) +{ + __this_cpu_write(current_vcpu, vcpu); +} + +static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { - return boot_cpu_has(X86_FEATURE_MWAIT) && - !boot_cpu_has_bug(X86_BUG_MONITOR); + __this_cpu_write(current_vcpu, NULL); } #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 91e9700cc6dc..25a972c61b0a 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -28,7 +28,6 @@ lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_RETPOLINE) += retpoline.o -OBJECT_FILES_NON_STANDARD_retpoline.o :=y obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 81b1635d67de..88acd349911b 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,6 +1,4 @@ #include <linux/linkage.h> 
-#include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c index d6f848d1211d..2dd1fe13a37b 100644 --- a/arch/x86/lib/cpu.c +++ b/arch/x86/lib/cpu.c @@ -18,7 +18,7 @@ unsigned int x86_model(unsigned int sig) { unsigned int fam, model; - fam = x86_family(sig); + fam = x86_family(sig); model = (sig >> 4) & 0xf; diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 7b881d03d0dd..3cdf06128d13 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -7,6 +7,7 @@ asmlinkage void just_return_func(void); asm( ".type just_return_func, @function\n" + ".globl just_return_func\n" "just_return_func:\n" " ret\n" ".size just_return_func, .-just_return_func\n" diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 693cce0be82d..fee8b9c0520c 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -2,6 +2,7 @@ #include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> +#include <linux/completion.h> #include <asm/msr.h> static void __rdmsr_on_cpu(void *info) @@ -143,13 +144,19 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) } EXPORT_SYMBOL(wrmsr_on_cpus); +struct msr_info_completion { + struct msr_info msr; + struct completion done; +}; + /* These "safe" variants are slower and should be used when the target MSR may not actually exist. 
*/ static void __rdmsr_safe_on_cpu(void *info) { - struct msr_info *rv = info; + struct msr_info_completion *rv = info; - rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); + rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h); + complete(&rv->done); } static void __wrmsr_safe_on_cpu(void *info) @@ -161,17 +168,26 @@ static void __wrmsr_safe_on_cpu(void *info) int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { + struct msr_info_completion rv; + call_single_data_t csd = { + .func = __rdmsr_safe_on_cpu, + .info = &rv, + }; int err; - struct msr_info rv; memset(&rv, 0, sizeof(rv)); + init_completion(&rv.done); + rv.msr.msr_no = msr_no; - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; + err = smp_call_function_single_async(cpu, &csd); + if (!err) { + wait_for_completion(&rv.done); + err = rv.msr.err; + } + *l = rv.msr.reg.l; + *h = rv.msr.reg.h; - return err ? err : rv.err; + return err; } EXPORT_SYMBOL(rdmsr_safe_on_cpu); @@ -209,16 +225,13 @@ EXPORT_SYMBOL(wrmsrl_safe_on_cpu); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { + u32 low, high; int err; - struct msr_info rv; - memset(&rv, 0, sizeof(rv)); + err = rdmsr_safe_on_cpu(cpu, msr_no, &low, &high); + *q = (u64)high << 32 | low; - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - *q = rv.reg.q; - - return err ? 
err : rv.err; + return err; } EXPORT_SYMBOL(rdmsrl_safe_on_cpu); diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 480edc3a5e03..c909961e678a 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,7 +7,6 @@ #include <asm/alternative-asm.h> #include <asm/export.h> #include <asm/nospec-branch.h> -#include <asm/bitsperlong.h> .macro THUNK reg .section .text.__x86.indirect_thunk @@ -47,58 +46,3 @@ GENERATE_THUNK(r13) GENERATE_THUNK(r14) GENERATE_THUNK(r15) #endif - -/* - * Fill the CPU return stack buffer. - * - * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; lfence; jmp' loop to capture speculative execution. - * - * This is required in various cases for retpoline and IBRS-based - * mitigations for the Spectre variant 2 vulnerability. Sometimes to - * eliminate potentially bogus entries from the RSB, and sometimes - * purely to ensure that it doesn't get empty, which on some CPUs would - * allow predictions from other (unwanted!) sources to be used. - * - * Google experimented with loop-unrolling and this turned out to be - * the optimal version - two calls, each with their own speculation - * trap should their return address end up getting used, in a loop. 
- */ -.macro STUFF_RSB nr:req sp:req - mov $(\nr / 2), %_ASM_BX - .align 16 -771: - call 772f -773: /* speculation trap */ - pause - lfence - jmp 773b - .align 16 -772: - call 774f -775: /* speculation trap */ - pause - lfence - jmp 775b - .align 16 -774: - dec %_ASM_BX - jnz 771b - add $((BITS_PER_LONG/8) * \nr), \sp -.endm - -#define RSB_FILL_LOOPS 16 /* To avoid underflow */ - -ENTRY(__fill_rsb) - STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP - ret -END(__fill_rsb) -EXPORT_SYMBOL_GPL(__fill_rsb) - -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ - -ENTRY(__clear_rsb) - STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP - ret -END(__clear_rsb) -EXPORT_SYMBOL_GPL(__clear_rsb) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 27e9e90a8d35..4b101dd6e52f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,12 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 -# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c -KCOV_INSTRUMENT_tlb.o := n -KCOV_INSTRUMENT_mem_encrypt.o := n +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n +KCOV_INSTRUMENT_mem_encrypt_identity.o := n -KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ @@ -16,6 +19,7 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_physaddr.o := $(nostackp) CFLAGS_setup_nx.o := $(nostackp) +CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace @@ -47,4 +51,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += 
mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index b9283cc27622..b45f5aaefd74 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -27,8 +27,20 @@ EXPORT_SYMBOL(get_cpu_entry_area); void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) { unsigned long va = (unsigned long) cea_vaddr; + pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags); - set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); + /* + * The cpu_entry_area is shared between the user and kernel + * page tables. All of its ptes can safely be global. + * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for + * non-present PTEs, so be careful not to set it in that + * case to avoid confusion. + */ + if (boot_cpu_has(X86_FEATURE_PGE) && + (pgprot_val(flags) & _PAGE_PRESENT)) + pte = pte_set_flags(pte, _PAGE_GLOBAL); + + set_pte_vaddr(va, pte); } static void __init @@ -163,4 +175,10 @@ void __init setup_cpu_entry_areas(void) for_each_possible_cpu(cpu) setup_cpu_entry_area(cpu); + + /* + * This is the last essential update to swapper_pgdir which needs + * to be synchronized to initial_page_table on 32bit. 
+ */ + sync_initial_page_table(); } diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 421f2664ffa0..225fe2f0bfec 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -1,4 +1,5 @@ #include <linux/debugfs.h> +#include <linux/efi.h> #include <linux/module.h> #include <linux/seq_file.h> #include <asm/pgtable.h> @@ -72,6 +73,30 @@ static const struct file_operations ptdump_curusr_fops = { }; #endif +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) +static struct dentry *pe_efi; + +static int ptdump_show_efi(struct seq_file *m, void *v) +{ + if (efi_mm.pgd) + ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); + return 0; +} + +static int ptdump_open_efi(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_efi, NULL); +} + +static const struct file_operations ptdump_efi_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_efi, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) @@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void) if (!pe_curusr) goto err; #endif + +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) + pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); + if (!pe_efi) + goto err; +#endif + return 0; err: debugfs_remove_recursive(dir); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2a4849e92831..cc7ff5957194 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/highmem.h> #include <asm/pgtable.h> @@ -29,6 +30,7 @@ struct pg_state { int level; pgprot_t current_prot; + pgprotval_t effective_prot; unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; @@ -85,11 +87,15 @@ static struct addr_marker 
address_markers[] = { [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, #ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, - [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, + /* + * These fields get initialized with the (dynamic) + * KASAN_SHADOW_{START,END} values in pt_dump_init(). + */ + [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL - [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, + [LDT_NR] = { 0UL, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, #ifdef CONFIG_X86_ESPFIX64 @@ -231,9 +237,9 @@ static unsigned long normalize_addr(unsigned long u) * print what we collected so far. */ static void note_page(struct seq_file *m, struct pg_state *st, - pgprot_t new_prot, int level) + pgprot_t new_prot, pgprotval_t new_eff, int level) { - pgprotval_t prot, cur; + pgprotval_t prot, cur, eff; static const char units[] = "BKMGTPE"; /* @@ -243,23 +249,24 @@ static void note_page(struct seq_file *m, struct pg_state *st, */ prot = pgprot_val(new_prot); cur = pgprot_val(st->current_prot); + eff = st->effective_prot; if (!st->level) { /* First entry */ st->current_prot = new_prot; + st->effective_prot = new_eff; st->level = level; st->marker = address_markers; st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || + } else if (prot != cur || new_eff != eff || level != st->level || st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; - pgprotval_t pr = pgprot_val(st->current_prot); - if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %p/%pS\n", (void 
*)st->start_address, @@ -313,22 +320,31 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->start_address = st->current_address; st->current_prot = new_prot; + st->effective_prot = new_eff; st->level = level; } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) +static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +{ + return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | + ((prot1 | prot2) & _PAGE_NX); +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + pgprotval_t eff_in, unsigned long P) { int i; - pte_t *start; - pgprotval_t prot; + pte_t *pte; + pgprotval_t prot, eff; - start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 5); - start++; + pte = pte_offset_map(&addr, st->current_address); + prot = pte_flags(*pte); + eff = effective_prot(eff_in, prot); + note_page(m, st, __pgprot(prot), eff, 5); + pte_unmap(pte); } } #ifdef CONFIG_KASAN @@ -344,12 +360,10 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, void *pt) { if (__pa(pt) == __pa(kasan_zero_pmd) || -#ifdef CONFIG_X86_5LEVEL - __pa(pt) == __pa(kasan_zero_p4d) || -#endif + (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) || __pa(pt) == __pa(kasan_zero_pud)) { pgprotval_t prot = pte_flags(kasan_zero_pte[0]); - note_page(m, st, __pgprot(prot), 5); + note_page(m, st, __pgprot(prot), 0, 5); return true; } return false; @@ -364,42 +378,45 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pmd_t *start, *pmd_start; - pgprotval_t prot; 
+ pgprotval_t prot, eff; pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { + prot = pmd_flags(*start); + eff = effective_prot(eff_in, prot); if (pmd_large(*start) || !pmd_present(*start)) { - prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), eff, 4); } else if (!kasan_page_table(m, st, pmd_start)) { - walk_pte_level(m, st, *start, + walk_pte_level(m, st, *start, eff, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 4); + note_page(m, st, __pgprot(0), 0, 4); start++; } } #else -#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p) #define pud_large(a) pmd_large(__pmd(pud_val(a))) #define pud_none(a) pmd_none(__pmd(pud_val(a))) #endif #if PTRS_PER_PUD > 1 -static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pud_t *start, *pud_start; - pgprotval_t prot; + pgprotval_t prot, eff; pud_t *prev_pud = NULL; pud_start = start = (pud_t *)p4d_page_vaddr(addr); @@ -407,15 +424,16 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { + prot = pud_flags(*start); + eff = effective_prot(eff_in, prot); if (pud_large(*start) || !pud_present(*start)) { - prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 3); + note_page(m, st, __pgprot(prot), eff, 3); } else if (!kasan_page_table(m, st, pud_start)) { - walk_pmd_level(m, st, *start, + walk_pmd_level(m, st, *start, eff, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 0, 
3); prev_pud = start; start++; @@ -423,43 +441,43 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p) #define p4d_large(a) pud_large(__pud(p4d_val(a))) #define p4d_none(a) pud_none(__pud(p4d_val(a))) #endif -#if PTRS_PER_P4D > 1 - -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + pgprotval_t eff_in, unsigned long P) { int i; p4d_t *start, *p4d_start; - pgprotval_t prot; + pgprotval_t prot, eff; + + if (PTRS_PER_P4D == 1) + return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P); p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); if (!p4d_none(*start)) { + prot = p4d_flags(*start); + eff = effective_prot(eff_in, prot); if (p4d_large(*start) || !p4d_present(*start)) { - prot = p4d_flags(*start); - note_page(m, st, __pgprot(prot), 2); + note_page(m, st, __pgprot(prot), eff, 2); } else if (!kasan_page_table(m, st, p4d_start)) { - walk_pud_level(m, st, *start, + walk_pud_level(m, st, *start, eff, P + i * P4D_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 0, 2); start++; } } -#else -#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) -#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) -#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) -#endif +#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled ? 
pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) static inline bool is_hypervisor_range(int idx) { @@ -483,7 +501,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, #else pgd_t *start = swapper_pg_dir; #endif - pgprotval_t prot; + pgprotval_t prot, eff; int i; struct pg_state st = {}; @@ -499,15 +517,20 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start) && !is_hypervisor_range(i)) { + prot = pgd_flags(*start); +#ifdef CONFIG_X86_PAE + eff = _PAGE_USER | _PAGE_RW; +#else + eff = prot; +#endif if (pgd_large(*start) || !pgd_present(*start)) { - prot = pgd_flags(*start); - note_page(m, &st, __pgprot(prot), 1); + note_page(m, &st, __pgprot(prot), eff, 1); } else { - walk_p4d_level(m, &st, *start, + walk_p4d_level(m, &st, *start, eff, i * PGD_LEVEL_MULT); } } else - note_page(m, &st, __pgprot(0), 1); + note_page(m, &st, __pgprot(0), 0, 1); cond_resched(); start++; @@ -515,7 +538,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); - note_page(m, &st, __pgprot(0), 0); + note_page(m, &st, __pgprot(0), 0, 0); if (!checkwx) return; if (st.wx_pages) @@ -570,6 +593,13 @@ static int __init pt_dump_init(void) address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +#endif +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif #endif #ifdef CONFIG_X86_32 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; diff --git a/arch/x86/mm/fault.c 
b/arch/x86/mm/fault.c index 800de815519c..73bd8c95ac71 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -330,7 +330,7 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; - if (pmd_huge(*pmd_k)) + if (pmd_large(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); @@ -417,11 +417,11 @@ void vmalloc_sync_all(void) */ static noinline int vmalloc_fault(unsigned long address) { - pgd_t *pgd, *pgd_ref; - p4d_t *p4d, *p4d_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; + pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) @@ -435,73 +435,51 @@ static noinline int vmalloc_fault(unsigned long address) * case just flush: */ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) + pgd_k = pgd_offset_k(address); + if (pgd_none(*pgd_k)) return -1; - if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgtable_l5_enabled) { if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_ref); + set_pgd(pgd, *pgd_k); arch_flush_lazy_mmu_mode(); } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); } } /* With 4-level paging, copying happens on the p4d level. 
*/ p4d = p4d_offset(pgd, address); - p4d_ref = p4d_offset(pgd_ref, address); - if (p4d_none(*p4d_ref)) + p4d_k = p4d_offset(pgd_k, address); + if (p4d_none(*p4d_k)) return -1; - if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { - set_p4d(p4d, *p4d_ref); + if (p4d_none(*p4d) && !pgtable_l5_enabled) { + set_p4d(p4d, *p4d_k); arch_flush_lazy_mmu_mode(); } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); } - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); pud = pud_offset(p4d, address); - pud_ref = pud_offset(p4d_ref, address); - if (pud_none(*pud_ref)) + if (pud_none(*pud)) return -1; - if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) - BUG(); - - if (pud_huge(*pud)) + if (pud_large(*pud)) return 0; pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) + if (pmd_none(*pmd)) return -1; - if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) - BUG(); - - if (pmd_huge(*pmd)) + if (pmd_large(*pmd)) return 0; - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); + if (!pte_present(*pte)) + return -1; return 0; } @@ -699,7 +677,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "paging request"); printk(KERN_CONT " at %px\n", (void *) address); - printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); } @@ -1248,10 +1225,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, tsk = current; mm = tsk->mm; - /* - * Detect and handle instructions that would cause a page fault for - * both a tracked kernel page and a userspace page. 
- */ prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ab33a32df2a8..a2f0c7e20fb0 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -98,6 +98,9 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (!info->kernpg_flag) info->kernpg_flag = _KERNPG_TABLE; + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + info->kernpg_flag &= __default_kernel_pte_mask; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -120,7 +123,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 82f5252c723a..fec82b577c18 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -161,12 +161,6 @@ struct map_range { static int page_size_mask; -static void enable_global_pages(void) -{ - if (!static_cpu_has(X86_FEATURE_PTI)) - __supported_pte_mask |= _PAGE_GLOBAL; -} - static void __init probe_page_size_mask(void) { /* @@ -187,9 +181,15 @@ static void __init probe_page_size_mask(void) __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); - enable_global_pages(); + __supported_pte_mask |= _PAGE_GLOBAL; } + /* By the default is everything supported: */ + __default_kernel_pte_mask = __supported_pte_mask; + /* Except when with PTI where the kernel is mostly non-Global: */ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + __default_kernel_pte_mask &= ~_PAGE_GLOBAL; + /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); diff --git a/arch/x86/mm/init_32.c 
b/arch/x86/mm/init_32.c index 79cb066f40c0..c893c6a3d707 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -453,6 +453,21 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ +void __init sync_initial_page_table(void) +{ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +} + void __init native_pagetable_init(void) { unsigned long pfn, va; @@ -543,8 +558,14 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); +#define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) +/* Bits supported by the hardware: */ +pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; +/* Bits allowed in normal kernel mappings: */ +pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; EXPORT_SYMBOL_GPL(__supported_pte_mask); +/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ +EXPORT_SYMBOL(__default_kernel_pte_mask); /* user-defined highmem size */ static unsigned int highmem_pages = -1; @@ -763,6 +784,7 @@ void __init mem_init(void) free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ab42c852069..0a400606dea0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -65,8 +65,13 @@ * around without checking the pgd every time. 
*/ +/* Bits supported by the hardware: */ pteval_t __supported_pte_mask __read_mostly = ~0; +/* Bits allowed in normal kernel mappings: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); +/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ +EXPORT_SYMBOL(__default_kernel_pte_mask); int force_personality32; @@ -88,12 +93,7 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); -/* - * When memory was added make sure all the processes MM have - * suitable PGD entries in the local PGD level page. - */ -#ifdef CONFIG_X86_5LEVEL -void sync_global_pgds(unsigned long start, unsigned long end) +static void sync_global_pgds_l5(unsigned long start, unsigned long end) { unsigned long addr; @@ -129,8 +129,8 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } -#else -void sync_global_pgds(unsigned long start, unsigned long end) + +static void sync_global_pgds_l4(unsigned long start, unsigned long end) { unsigned long addr; @@ -143,7 +143,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * With folded p4d, pgd_none() is always false, we need to * handle synchonization on p4d level. */ - BUILD_BUG_ON(pgd_none(*pgd_ref)); + MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) @@ -173,7 +173,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } -#endif + +/* + * When memory was added make sure all the processes MM have + * suitable PGD entries in the local PGD level page. 
+ */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + if (pgtable_l5_enabled) + sync_global_pgds_l5(start, end); + else + sync_global_pgds_l4(start, end); +} /* * NOTE: This function is marked __ref because it calls __init function @@ -256,7 +267,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) * It's enough to flush this one mapping. * (PGE mappings get flushed as well) */ - __flush_tlb_one(vaddr); + __flush_tlb_one_kernel(vaddr); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) @@ -632,7 +643,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { @@ -712,7 +723,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) pgd_populate(&init_mm, pgd, p4d); else p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); @@ -800,17 +811,11 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, #define PAGE_INUSE 0xFD -static void __meminit free_pagetable(struct page *page, int order, - struct vmem_altmap *altmap) +static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; unsigned int nr_pages = 1 << order; - if (altmap) { - vmem_altmap_free(altmap, nr_pages); - return; - } - /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); @@ -826,9 +831,17 @@ static void __meminit free_pagetable(struct page *page, int order, free_pages((unsigned long)page_address(page), order); } -static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, +static void __meminit free_hugepage_table(struct 
page *page, struct vmem_altmap *altmap) { + if (altmap) + vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE); + else + free_pagetable(page, get_order(PMD_SIZE)); +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ pte_t *pte; int i; @@ -839,14 +852,13 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, } /* free a pte talbe */ - free_pagetable(pmd_page(*pmd), 0, altmap); + free_pagetable(pmd_page(*pmd), 0); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); } -static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, - struct vmem_altmap *altmap) +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) { pmd_t *pmd; int i; @@ -858,14 +870,13 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, } /* free a pmd talbe */ - free_pagetable(pud_page(*pud), 0, altmap); + free_pagetable(pud_page(*pud), 0); spin_lock(&init_mm.page_table_lock); pud_clear(pud); spin_unlock(&init_mm.page_table_lock); } -static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d, - struct vmem_altmap *altmap) +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) { pud_t *pud; int i; @@ -877,7 +888,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d, } /* free a pud talbe */ - free_pagetable(p4d_page(*p4d), 0, altmap); + free_pagetable(p4d_page(*p4d), 0); spin_lock(&init_mm.page_table_lock); p4d_clear(p4d); spin_unlock(&init_mm.page_table_lock); @@ -885,7 +896,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d, static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, - struct vmem_altmap *altmap, bool direct) + bool direct) { unsigned long next, pages = 0; pte_t *pte; @@ -916,7 +927,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, * freed when offlining, or simplely not in use. 
*/ if (!direct) - free_pagetable(pte_page(*pte), 0, altmap); + free_pagetable(pte_page(*pte), 0); spin_lock(&init_mm.page_table_lock); pte_clear(&init_mm, addr, pte); @@ -939,7 +950,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, page_addr = page_address(pte_page(*pte)); if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { - free_pagetable(pte_page(*pte), 0, altmap); + free_pagetable(pte_page(*pte), 0); spin_lock(&init_mm.page_table_lock); pte_clear(&init_mm, addr, pte); @@ -974,9 +985,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - free_pagetable(pmd_page(*pmd), - get_order(PMD_SIZE), - altmap); + free_hugepage_table(pmd_page(*pmd), + altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); @@ -989,9 +999,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, page_addr = page_address(pmd_page(*pmd)); if (!memchr_inv(page_addr, PAGE_INUSE, PMD_SIZE)) { - free_pagetable(pmd_page(*pmd), - get_order(PMD_SIZE), - altmap); + free_hugepage_table(pmd_page(*pmd), + altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); @@ -1003,8 +1012,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next, altmap, direct); - free_pte_table(pte_base, pmd, altmap); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); } /* Call free_pmd_table() in remove_pud_table(). 
*/ @@ -1033,8 +1042,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, IS_ALIGNED(next, PUD_SIZE)) { if (!direct) free_pagetable(pud_page(*pud), - get_order(PUD_SIZE), - altmap); + get_order(PUD_SIZE)); spin_lock(&init_mm.page_table_lock); pud_clear(pud); @@ -1048,8 +1056,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, if (!memchr_inv(page_addr, PAGE_INUSE, PUD_SIZE)) { free_pagetable(pud_page(*pud), - get_order(PUD_SIZE), - altmap); + get_order(PUD_SIZE)); spin_lock(&init_mm.page_table_lock); pud_clear(pud); @@ -1062,7 +1069,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, pmd_base = pmd_offset(pud, 0); remove_pmd_table(pmd_base, addr, next, direct, altmap); - free_pmd_table(pmd_base, pud, altmap); + free_pmd_table(pmd_base, pud); } if (direct) @@ -1093,8 +1100,8 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, * 5-level case we should free them. This code will have to change * to adapt for boot-time switching between 4 and 5 level page tables. 
*/ - if (CONFIG_PGTABLE_LEVELS == 5) - free_pud_table(pud_base, p4d, altmap); + if (pgtable_l5_enabled) + free_pud_table(pud_base, p4d); } if (direct) @@ -1183,6 +1190,7 @@ void __init mem_init(void) /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); /* * Must be done after boot memory is put on freelist, because here we @@ -1193,8 +1201,8 @@ void __init mem_init(void) register_page_bootmem_info(); /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, - PAGE_SIZE, KCORE_OTHER); + if (get_gate_vma(&init_mm)) + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); mem_init_print_info(NULL); } @@ -1283,6 +1291,12 @@ void mark_rodata_ro(void) (unsigned long) __va(__pa_symbol(_sdata))); debug_checkwx(); + + /* + * Do this after all of the manipulation of the + * kernel text page tables are complete. + */ + pti_clone_kernel_text(); } int kern_addr_valid(unsigned long addr) @@ -1326,14 +1340,39 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } +/* + * Block size is the minimum amount of memory which can be hotplugged or + * hotremoved. It must be power of two and must be equal or larger than + * MIN_MEMORY_BLOCK_SIZE. 
+ */ +#define MAX_BLOCK_SIZE (2UL << 30) + +/* Amount of ram needed to start using large blocks */ +#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30) + static unsigned long probe_memory_block_size(void) { - unsigned long bz = MIN_MEMORY_BLOCK_SIZE; + unsigned long boot_mem_end = max_pfn << PAGE_SHIFT; + unsigned long bz; - /* if system is UV or has 64GB of RAM or more, use large blocks */ - if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) - bz = 2UL << 30; /* 2GB */ + /* If this is UV system, always set 2G block size */ + if (is_uv_system()) { + bz = MAX_BLOCK_SIZE; + goto done; + } + + /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ + if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) { + bz = MIN_MEMORY_BLOCK_SIZE; + goto done; + } + /* Find the largest allowed block size that aligns to memory end */ + for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + if (IS_ALIGNED(boot_mem_end, bz)) + break; + } +done: pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index ada98b39b8ad..b3294d36769d 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,6 +44,9 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) return ret; *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(*prot) &= __default_kernel_pte_mask; + return 0; } EXPORT_SYMBOL_GPL(iomap_create_wc); @@ -88,6 +91,9 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c45b6ec5357b..c63a545ec199 100644 --- 
a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -816,9 +816,12 @@ void __init __early_set_fixmap(enum fixed_addresses idx, } pte = early_ioremap_pte(addr); + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else pte_clear(&init_mm, addr, pte); - __flush_tlb_one(addr); + __flush_tlb_one_kernel(addr); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index af6f2f9c6a26..980dbebd0ca7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,6 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt + +#ifdef CONFIG_X86_5LEVEL +/* Too early to use cpu_feature_enabled() */ +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + #include <linux/bootmem.h> #include <linux/kasan.h> #include <linux/kdebug.h> @@ -19,7 +25,7 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; -static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); static __init void *early_alloc(size_t size, int nid, bool panic) { @@ -176,10 +182,10 @@ static void __init clear_pgds(unsigned long start, * With folded p4d, pgd_clear() is nop, use p4d_clear() * instead. 
*/ - if (CONFIG_PGTABLE_LEVELS < 5) - p4d_clear(p4d_offset(pgd, start)); - else + if (pgtable_l5_enabled) pgd_clear(pgd); + else + p4d_clear(p4d_offset(pgd, start)); } pgd = pgd_offset_k(start); @@ -191,7 +197,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) { unsigned long p4d; - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) return (p4d_t *)pgd; p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; @@ -263,6 +269,12 @@ void __init kasan_early_init(void) pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; + /* Mask out unsupported __PAGE_KERNEL bits: */ + pte_val &= __default_kernel_pte_mask; + pmd_val &= __default_kernel_pte_mask; + pud_val &= __default_kernel_pte_mask; + p4d_val &= __default_kernel_pte_mask; + for (i = 0; i < PTRS_PER_PTE; i++) kasan_zero_pte[i] = __pte(pte_val); @@ -272,7 +284,7 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); - for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) + for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); @@ -303,7 +315,7 @@ void __init kasan_init(void) * bunch of things like kernel code, modules, EFI mapping, etc. * We need to take extra steps to not overwrite them. 
*/ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { void *ptr; ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); @@ -365,7 +377,13 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); + pte_t pte; + pgprot_t prot; + + prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC); + pgprot_val(prot) &= __default_kernel_pte_mask; + + pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot)); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aedebd2ebf1e..615cc03ced84 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,23 +34,12 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. - * * The end address could depend on more configuration options to make the * highest amount of space for randomization available, but that's too hard * to keep straight and caused issues already. */ -static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -/* Default values */ -unsigned long page_offset_base = __PAGE_OFFSET_BASE; -EXPORT_SYMBOL(page_offset_base); -unsigned long vmalloc_base = __VMALLOC_BASE; -EXPORT_SYMBOL(vmalloc_base); -unsigned long vmemmap_base = __VMEMMAP_BASE; -EXPORT_SYMBOL(vmemmap_base); - /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). The list is ordered based on virtual addresses. 
This @@ -60,8 +49,8 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, - { &vmalloc_base, VMALLOC_SIZE_TB }, + { &page_offset_base, 0 }, + { &vmalloc_base, 0 }, { &vmemmap_base, 1 }, }; @@ -84,11 +73,14 @@ static inline bool kaslr_memory_enabled(void) void __init kernel_randomize_memory(void) { size_t i; - unsigned long vaddr = vaddr_start; + unsigned long vaddr_start, vaddr; unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; + vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; + vaddr = vaddr_start; + /* * These BUILD_BUG_ON checks ensure the memory layout is consistent * with the vaddr_start/vaddr_end variables. These checks are very @@ -101,6 +93,9 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; + kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT); + kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; + /* * Update Physical memory mapping to available and * add padding if needed (especially for memory hotplug support). @@ -129,7 +124,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) entropy = (rand % (entropy + 1)) & P4D_MASK; else entropy = (rand % (entropy + 1)) & PUD_MASK; @@ -141,7 +136,7 @@ void __init kernel_randomize_memory(void) * randomization alignment. 
*/ vaddr += get_padding(&kaslr_regions[i]); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) vaddr = round_up(vaddr + 1, P4D_SIZE); else vaddr = round_up(vaddr + 1, PUD_SIZE); @@ -217,7 +212,7 @@ void __meminit init_trampoline(void) return; } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) init_trampoline_p4d(); else init_trampoline_pud(); diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 58477ec3d66d..7c8686709636 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -168,7 +168,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->addr); + __flush_tlb_one_kernel(f->addr); return 0; } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 1a53071e2e17..b2de398d1fd3 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -25,17 +25,12 @@ #include <asm/bootparam.h> #include <asm/set_memory.h> #include <asm/cacheflush.h> -#include <asm/sections.h> #include <asm/processor-flags.h> #include <asm/msr.h> #include <asm/cmdline.h> #include "mm_internal.h" -static char sme_cmdline_arg[] __initdata = "mem_encrypt"; -static char sme_cmdline_on[] __initdata = "on"; -static char sme_cmdline_off[] __initdata = "off"; - /* * Since SME related variables are set early in the boot process they must * reside in the .data section so as not to be zeroed out when the .bss @@ -46,7 +41,7 @@ EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); -static bool sev_enabled __section(.data); +bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -200,67 +195,6 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } -static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - unsigned long dma_mask; - unsigned int 
order; - struct page *page; - void *vaddr = NULL; - - dma_mask = dma_alloc_coherent_mask(dev, gfp); - order = get_order(size); - - /* - * Memory will be memset to zero after marking decrypted, so don't - * bother clearing it before. - */ - gfp &= ~__GFP_ZERO; - - page = alloc_pages_node(dev_to_node(dev), gfp, order); - if (page) { - dma_addr_t addr; - - /* - * Since we will be clearing the encryption bit, check the - * mask with it already cleared. - */ - addr = __sme_clr(phys_to_dma(dev, page_to_phys(page))); - if ((addr + size) > dma_mask) { - __free_pages(page, get_order(size)); - } else { - vaddr = page_address(page); - *dma_handle = addr; - } - } - - if (!vaddr) - vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp); - - if (!vaddr) - return NULL; - - /* Clear the SME encryption bit for DMA use if not swiotlb area */ - if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) { - set_memory_decrypted((unsigned long)vaddr, 1 << order); - memset(vaddr, 0, PAGE_SIZE << order); - *dma_handle = __sme_clr(*dma_handle); - } - - return vaddr; -} - -static void sev_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - /* Set the SME encryption bit for re-use if not swiotlb area */ - if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle))) - set_memory_encrypted((unsigned long)vaddr, - 1 << get_order(size)); - - swiotlb_free_coherent(dev, size, vaddr, dma_handle); -} - static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; @@ -413,20 +347,6 @@ bool sev_active(void) } EXPORT_SYMBOL(sev_active); -static const struct dma_map_ops sev_dma_ops = { - .alloc = sev_alloc, - .free = sev_free, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 
- .sync_sg_for_device = swiotlb_sync_sg_for_device, - .mapping_error = swiotlb_dma_mapping_error, -}; - /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { @@ -437,12 +357,11 @@ void __init mem_encrypt_init(void) swiotlb_update_mem_attributes(); /* - * With SEV, DMA operations cannot use encryption. New DMA ops - * are required in order to mark the DMA areas as decrypted or - * to use bounce buffers. + * With SEV, DMA operations cannot use encryption, we need to use + * SWIOTLB to bounce buffer DMA operation. */ if (sev_active()) - dma_ops = &sev_dma_ops; + dma_ops = &swiotlb_dma_ops; /* * With SEV, we need to unroll the rep string I/O instructions. @@ -455,582 +374,3 @@ void __init mem_encrypt_init(void) : "Secure Memory Encryption (SME)"); } -void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) -{ - WARN(PAGE_ALIGN(size) != size, - "size is not page-aligned (%#lx)\n", size); - - /* Make the SWIOTLB buffer area decrypted */ - set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); -} - -struct sme_populate_pgd_data { - void *pgtable_area; - pgd_t *pgd; - - pmdval_t pmd_flags; - pteval_t pte_flags; - unsigned long paddr; - - unsigned long vaddr; - unsigned long vaddr_end; -}; - -static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) -{ - unsigned long pgd_start, pgd_end, pgd_size; - pgd_t *pgd_p; - - pgd_start = ppd->vaddr & PGDIR_MASK; - pgd_end = ppd->vaddr_end & PGDIR_MASK; - - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - - memset(pgd_p, 0, pgd_size); -} - -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS _KERNPG_TABLE_NOENC - -#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) - -#define PMD_FLAGS_DEC PMD_FLAGS_LARGE -#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define 
PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) - -#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) - -#define PTE_FLAGS_DEC PTE_FLAGS -#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) - -static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) -{ - pgd_t *pgd_p; - p4d_t *p4d_p; - pud_t *pud_p; - pmd_t *pmd_p; - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - if (native_pgd_val(*pgd_p)) { - if (IS_ENABLED(CONFIG_X86_5LEVEL)) - p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - else - pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - } else { - pgd_t pgd; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = ppd->pgtable_area; - memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; - - pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); - } else { - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); - } - native_set_pgd(pgd_p, pgd); - } - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(ppd->vaddr); - if (native_p4d_val(*p4d_p)) { - pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); - } else { - p4d_t p4d; - - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); - native_set_p4d(p4d_p, p4d); - } - } - - pud_p += pud_index(ppd->vaddr); - if (native_pud_val(*pud_p)) { - if (native_pud_val(*pud_p) & _PAGE_PSE) - return NULL; - - pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); - } else { - pud_t pud; - - pmd_p = ppd->pgtable_area; - memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; - - pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); - 
native_set_pud(pud_p, pud); - } - - return pmd_p; -} - -static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) -{ - pmd_t *pmd_p; - - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) - return; - - pmd_p += pmd_index(ppd->vaddr); - if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); -} - -static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) -{ - pmd_t *pmd_p; - pte_t *pte_p; - - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) - return; - - pmd_p += pmd_index(ppd->vaddr); - if (native_pmd_val(*pmd_p)) { - if (native_pmd_val(*pmd_p) & _PAGE_PSE) - return; - - pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); - } else { - pmd_t pmd; - - pte_p = ppd->pgtable_area; - memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); - ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; - - pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); - native_set_pmd(pmd_p, pmd); - } - - pte_p += pte_index(ppd->vaddr); - if (!native_pte_val(*pte_p)) - native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); -} - -static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd_large(ppd); - - ppd->vaddr += PMD_PAGE_SIZE; - ppd->paddr += PMD_PAGE_SIZE; - } -} - -static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd(ppd); - - ppd->vaddr += PAGE_SIZE; - ppd->paddr += PAGE_SIZE; - } -} - -static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, - pmdval_t pmd_flags, pteval_t pte_flags) -{ - unsigned long vaddr_end; - - ppd->pmd_flags = pmd_flags; - ppd->pte_flags = pte_flags; - - /* Save original end value since we modify the struct value */ - vaddr_end = ppd->vaddr_end; - - /* If start is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); - 
__sme_map_range_pte(ppd); - - /* Create PMD entries */ - ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; - __sme_map_range_pmd(ppd); - - /* If end is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = vaddr_end; - __sme_map_range_pte(ppd); -} - -static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); -} - -static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); -} - -static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); -} - -static unsigned long __init sme_pgtable_calc(unsigned long len) -{ - unsigned long p4d_size, pud_size, pmd_size, pte_size; - unsigned long total; - - /* - * Perform a relatively simplistic calculation of the pagetable - * entries that are needed. Those mappings will be covered mostly - * by 2MB PMD entries so we can conservatively calculate the required - * number of P4D, PUD and PMD structures needed to perform the - * mappings. For mappings that are not 2MB aligned, PTE mappings - * would be needed for the start and end portion of the address range - * that fall outside of the 2MB alignment. This results in, at most, - * two extra pages to hold PTE entries for each range that is mapped. - * Incrementing the count for each covers the case where the addresses - * cross entries. 
- */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - - total = p4d_size + pud_size + pmd_size + pte_size; - - /* - * Now calculate the added pagetable structures needed to populate - * the new pagetables. - */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - - total += p4d_size + pud_size + pmd_size; - - return total; -} - -void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) -{ - unsigned long workarea_start, workarea_end, workarea_len; - unsigned long execute_start, execute_end, execute_len; - unsigned long kernel_start, kernel_end, kernel_len; - unsigned long initrd_start, initrd_end, initrd_len; - struct sme_populate_pgd_data ppd; - unsigned long pgtable_area_len; - unsigned long decrypted_base; - - if (!sme_active()) - return; - - /* - * Prepare for encrypting the kernel and initrd by building new - * pagetables with the necessary attributes needed to encrypt the - * kernel in place. - * - * One range of virtual addresses will map the memory occupied - * by the kernel and initrd as encrypted. 
- * - * Another range of virtual addresses will map the memory occupied - * by the kernel and initrd as decrypted and write-protected. - * - * The use of write-protect attribute will prevent any of the - * memory from being cached. - */ - - /* Physical addresses gives us the identity mapped virtual addresses */ - kernel_start = __pa_symbol(_text); - kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); - kernel_len = kernel_end - kernel_start; - - initrd_start = 0; - initrd_end = 0; - initrd_len = 0; -#ifdef CONFIG_BLK_DEV_INITRD - initrd_len = (unsigned long)bp->hdr.ramdisk_size | - ((unsigned long)bp->ext_ramdisk_size << 32); - if (initrd_len) { - initrd_start = (unsigned long)bp->hdr.ramdisk_image | - ((unsigned long)bp->ext_ramdisk_image << 32); - initrd_end = PAGE_ALIGN(initrd_start + initrd_len); - initrd_len = initrd_end - initrd_start; - } -#endif - - /* Set the encryption workarea to be immediately after the kernel */ - workarea_start = kernel_end; - - /* - * Calculate required number of workarea bytes needed: - * executable encryption area size: - * stack page (PAGE_SIZE) - * encryption routine page (PAGE_SIZE) - * intermediate copy buffer (PMD_PAGE_SIZE) - * pagetable structures for the encryption of the kernel - * pagetable structures for workarea (in case not currently mapped) - */ - execute_start = workarea_start; - execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; - execute_len = execute_end - execute_start; - - /* - * One PGD for both encrypted and decrypted mappings and a set of - * PUDs and PMDs for each of the encrypted and decrypted mappings. 
- */ - pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; - pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; - if (initrd_len) - pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; - - /* PUDs and PMDs needed in the current pagetables for the workarea */ - pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); - - /* - * The total workarea includes the executable encryption area and - * the pagetable area. The start of the workarea is already 2MB - * aligned, align the end of the workarea on a 2MB boundary so that - * we don't try to create/allocate PTE entries from the workarea - * before it is mapped. - */ - workarea_len = execute_len + pgtable_area_len; - workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); - - /* - * Set the address to the start of where newly created pagetable - * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable - * structures are created when the workarea is added to the current - * pagetables and when the new encrypted and decrypted kernel - * mappings are populated. - */ - ppd.pgtable_area = (void *)execute_end; - - /* - * Make sure the current pagetable structure has entries for - * addressing the workarea. - */ - ppd.pgd = (pgd_t *)native_read_cr3_pa(); - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); - - /* - * A new pagetable structure is being built to allow for the kernel - * and initrd to be encrypted. It starts with an empty PGD that will - * then be populated with new PUDs and PMDs as the encrypted and - * decrypted kernel mappings are created. - */ - ppd.pgd = ppd.pgtable_area; - memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); - ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; - - /* - * A different PGD index/entry must be used to get different - * pagetable entries for the decrypted mapping. 
Choose the next - * PGD index and convert it to a virtual address to be used as - * the base of the mapping. - */ - decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); - if (initrd_len) { - unsigned long check_base; - - check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); - decrypted_base = max(decrypted_base, check_base); - } - decrypted_base <<= PGDIR_SHIFT; - - /* Add encrypted kernel (identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start; - ppd.vaddr_end = kernel_end; - sme_map_range_encrypted(&ppd); - - /* Add decrypted, write-protected kernel (non-identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - - if (initrd_len) { - /* Add encrypted initrd (identity) mappings */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start; - ppd.vaddr_end = initrd_end; - sme_map_range_encrypted(&ppd); - /* - * Add decrypted, write-protected initrd (non-identity) mappings - */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - } - - /* Add decrypted workarea mappings to both kernel mappings */ - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_map_range_decrypted(&ppd); - - /* Perform the encryption */ - sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)ppd.pgd); - - if (initrd_len) - sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, - initrd_len, workarea_start, - (unsigned long)ppd.pgd); - - /* - * At this point we are running encrypted. 
Remove the mappings for - * the decrypted areas - all that is needed for this is to remove - * the PGD entry/entries. - */ - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_clear_pgd(&ppd); - - if (initrd_len) { - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_clear_pgd(&ppd); - } - - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_clear_pgd(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); -} - -void __init __nostackprotector sme_enable(struct boot_params *bp) -{ - const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; - unsigned int eax, ebx, ecx, edx; - unsigned long feature_mask; - bool active_by_default; - unsigned long me_mask; - char buffer[16]; - u64 msr; - - /* Check for the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - -#define AMD_SME_BIT BIT(0) -#define AMD_SEV_BIT BIT(1) - /* - * Set the feature mask (SME or SEV) based on whether we are - * running under a hypervisor. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - feature_mask = (ecx & BIT(31)) ? 
AMD_SEV_BIT : AMD_SME_BIT; - - /* - * Check for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & feature_mask)) - return; - - me_mask = 1UL << (ebx & 0x3f); - - /* Check if memory encryption is enabled */ - if (feature_mask == AMD_SME_BIT) { - /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) - return; - } else { - /* For SEV, check the SEV MSR */ - msr = __rdmsr(MSR_AMD64_SEV); - if (!(msr & MSR_AMD64_SEV_ENABLED)) - return; - - /* SEV state cannot be controlled by a command line option */ - sme_me_mask = me_mask; - sev_enabled = true; - return; - } - - /* - * Fixups have not been applied to phys_base yet and we're running - * identity mapped, so we must obtain the address to the SME command - * line argument data using rip-relative addressing. - */ - asm ("lea sme_cmdline_arg(%%rip), %0" - : "=r" (cmdline_arg) - : "p" (sme_cmdline_arg)); - asm ("lea sme_cmdline_on(%%rip), %0" - : "=r" (cmdline_on) - : "p" (sme_cmdline_on)); - asm ("lea sme_cmdline_off(%%rip), %0" - : "=r" (cmdline_off) - : "p" (sme_cmdline_off)); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) - active_by_default = true; - else - active_by_default = false; - - cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | - ((u64)bp->ext_cmd_line_ptr << 32)); - - cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); - - if (!strncmp(buffer, cmdline_on, sizeof(buffer))) - sme_me_mask = me_mask; - else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) - sme_me_mask = 0; - else - sme_me_mask = active_by_default ? 
me_mask : 0; -} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 01f682cf77a8..40a6085063d6 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -15,6 +15,7 @@ #include <asm/page.h> #include <asm/processor-flags.h> #include <asm/msr-index.h> +#include <asm/nospec-branch.h> .text .code64 @@ -59,6 +60,7 @@ ENTRY(sme_encrypt_execute) movq %rax, %r8 /* Workarea encryption routine */ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + ANNOTATE_RETPOLINE_SAFE call *%rax /* Call the encryption routine */ pop %r12 diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c new file mode 100644 index 000000000000..1b2197d13832 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -0,0 +1,564 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define DISABLE_BRANCH_PROFILING + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* + * Special hack: we have to be careful, because no indirections are + * allowed here, and paravirt_ops is a kind of one. As it will only run in + * baremetal anyway, we just keep it from happening. (This list needs to + * be extended when new paravirt and debugging variants are added.) 
+ */ +#undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mem_encrypt.h> + +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/cmdline.h> + +#include "mm_internal.h" + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + + memset(pgd_p, 0, pgd_size); +} + +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = ppd->pgd + pgd_index(ppd->vaddr); + if (pgd_none(*pgd)) { + p4d = ppd->pgtable_area; + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; + set_pgd(pgd, __pgd(PGD_FLAGS | 
__pa(p4d))); + } + + p4d = p4d_offset(pgd, ppd->vaddr); + if (p4d_none(*p4d)) { + pud = ppd->pgtable_area; + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); + } + + pud = pud_offset(p4d, ppd->vaddr); + if (pud_none(*pud)) { + pmd = ppd->pgtable_area; + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); + } + + if (pud_large(*pud)) + return NULL; + + return pud; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_large(*pmd)) + return; + + set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_none(*pmd)) { + pte = ppd->pgtable_area; + memset(pte, 0, sizeof(pte) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE; + set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); + } + + if (pmd_large(*pmd)) + return; + + pte = pte_offset_map(pmd, ppd->vaddr); + if (pte_none(*pte)) + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + 
ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long entries = 0, tables = 0; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. + */ + + /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. 
*/ + if (PTRS_PER_P4D > 1) + entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; + entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + + if (PTRS_PER_P4D > 1) + tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; + tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; + tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; + + return entries + tables; +} + +void __init sme_encrypt_kernel(struct boot_params *bp) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; + unsigned long pgtable_area_len; + unsigned long decrypted_base; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel and initrd as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel and initrd as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. 
+ */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. 
The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + ppd.pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. + */ + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. 
+ */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } + decrypted_base <<= PGDIR_SHIFT; + + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + + /* Add decrypted workarea mappings to both kernel mappings */ + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. 
+ */ + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + /* + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? 
AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & feature_mask)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; + return; + } + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? 
me_mask : 0; +} diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 155ecbac9e28..48c591251600 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -90,9 +90,10 @@ unsigned long arch_mmap_rnd(void) return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; @@ -126,16 +127,17 @@ static unsigned long mmap_legacy_base(unsigned long rnd, * process VM image, sets up which VM layout function to use: */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, - unsigned long random_factor, unsigned long task_size) + unsigned long random_factor, unsigned long task_size, + struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) *base = *legacy_base; else - *base = mmap_base(random_factor, task_size); + *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm->get_unmapped_area = arch_get_unmapped_area; @@ -143,7 +145,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0), + rlim_stack); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -153,7 +156,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. 
*/ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), task_size_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit(), + rlim_stack); #endif } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index aca6295350f3..e8a4a09e20f1 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end) } printk(KERN_CONT "\n"); } - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} #endif extern unsigned long highend_pfn, highstart_pfn; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 85cf12219dea..3bded76e8d5c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -93,6 +93,18 @@ void arch_report_meminfo(struct seq_file *m) static inline void split_page_count(int level) { } #endif +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +static inline int +within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr <= end; +} + #ifdef CONFIG_X86_64 static inline unsigned long highmap_start_pfn(void) @@ -106,20 +118,25 @@ static inline unsigned long highmap_end_pfn(void) return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } -#endif - -static inline int -within(unsigned long addr, unsigned long start, unsigned long end) +static bool __cpa_pfn_in_highmap(unsigned long pfn) { - return addr >= start && addr < end; + /* + * Kernel text has an alias mapping at a high address, known + * here as "highmap". 
+ */ + return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } -static inline int -within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +#else + +static bool __cpa_pfn_in_highmap(unsigned long pfn) { - return addr >= start && addr <= end; + /* There is no highmap on 32-bit */ + return false; } +#endif + /* * Flushing functions */ @@ -172,7 +189,7 @@ static void __cpa_flush_all(void *arg) static void cpa_flush_all(unsigned long cache) { - BUG_ON(irqs_disabled()); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } @@ -236,7 +253,7 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ #endif - BUG_ON(irqs_disabled()); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); @@ -298,9 +315,11 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, /* * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. + * catches all aliases. This also includes __ro_after_init, + * so do not enforce until kernel_set_to_readonly is true. */ - if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + if (kernel_set_to_readonly && + within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; @@ -512,6 +531,23 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) #endif } +static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) +{ + /* + * _PAGE_GLOBAL means "global page" for present PTEs. + * But, it is also used to indicate _PAGE_PROTNONE + * for non-present PTEs. + * + * This ensures that a _PAGE_GLOBAL PTE going from + * present to non-present is not confused as + * _PAGE_PROTNONE. 
+ */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + pgprot_val(prot) &= ~_PAGE_GLOBAL; + + return prot; +} + static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) @@ -566,6 +602,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; + /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); @@ -577,19 +614,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); - - /* - * Set the PSE and GLOBAL flags only if the PRESENT flag is - * set otherwise pmd_present/pmd_huge will return true even on - * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL - * for the ancient hardware that doesn't support it. - */ + req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) - pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; - else - pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); - - req_prot = canon_pgprot(req_prot); + pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need @@ -674,8 +701,12 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); - /* clear PSE and promote PAT bit to correct position */ + /* + * Clear PSE (aka _PAGE_PAT) and move + * PAT bit to correct position. + */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); break; @@ -698,23 +729,14 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, return 1; } - /* - * Set the GLOBAL flags only if the PRESENT flag is set - * otherwise pmd/pte_present will return true even on a non - * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL - * for the ancient hardware that doesn't support it. 
- */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_GLOBAL; - else - pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; + ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); + set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); @@ -930,19 +952,7 @@ static void populate_pte(struct cpa_data *cpa, pte = pte_offset_kernel(pmd, start); - /* - * Set the GLOBAL flags only if the PRESENT flag is - * set otherwise pte_present will return true even on - * a non present pte. The canon_pgprot will clear - * _PAGE_GLOBAL for the ancient hardware that doesn't - * support it. - */ - if (pgprot_val(pgprot) & _PAGE_PRESENT) - pgprot_val(pgprot) |= _PAGE_GLOBAL; - else - pgprot_val(pgprot) &= ~_PAGE_GLOBAL; - - pgprot = canon_pgprot(pgprot); + pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); @@ -1190,6 +1200,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; + + } else if (__cpa_pfn_in_highmap(cpa->pfn)) { + /* Faults in the highmap are OK, so do not warn: */ + return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, @@ -1234,24 +1248,14 @@ repeat: new_prot = static_protections(new_prot, address, pfn); - /* - * Set the GLOBAL flags only if the PRESENT flag is - * set otherwise pte_present will return true even on - * a non present pte. The canon_pgprot will clear - * _PAGE_GLOBAL for the ancient hardware that doesn't - * support it. 
- */ - if (pgprot_val(new_prot) & _PAGE_PRESENT) - pgprot_val(new_prot) |= _PAGE_GLOBAL; - else - pgprot_val(new_prot) &= ~_PAGE_GLOBAL; + new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change it's attributes * not the memory it points to */ - new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? @@ -1352,8 +1356,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && - within_inclusive(cpa->pfn, highmap_start_pfn(), - highmap_end_pfn())) { + __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; @@ -1428,11 +1431,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, memset(&cpa, 0, sizeof(cpa)); /* - * Check, if we are requested to change a not supported - * feature: + * Check, if we are requested to set a not supported + * feature. Clearing non-supported features is OK. 
*/ mask_set = canon_pgprot(mask_set); - mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; @@ -1775,6 +1778,12 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +int set_memory_nonglobal(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 004abf9ebf12..ffc8c13c50e4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> @@ -583,6 +584,9 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } @@ -636,6 +640,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) (mtrr != MTRR_TYPE_WRBACK)) return 0; + /* Bail out if we are we on a populated non-leaf entry: */ + if (pud_present(*pud) && !pud_huge(*pud)) + return 0; + prot = pgprot_4k_2_large(prot); set_pte((pte_t *)pud, pfn_pte( @@ -664,6 +672,10 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 0; } + /* Bail out if we are we on a populated non-leaf entry: */ + if (pmd_present(*pmd) && !pmd_huge(*pmd)) + return 0; + prot = pgprot_4k_2_large(prot); set_pte((pte_t *)pmd, pfn_pte( @@ -702,4 +714,52 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } + +/** + * pud_free_pmd_page - Clear pud entry and free pmd page. + * @pud: Pointer to a PUD. + * + * Context: The pud range has been unmaped and TLB purged. 
+ * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pud_free_pmd_page(pud_t *pud) +{ + pmd_t *pmd; + int i; + + if (pud_none(*pud)) + return 1; + + pmd = (pmd_t *)pud_page_vaddr(*pud); + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_free_pte_page(&pmd[i])) + return 0; + + pud_clear(pud); + free_page((unsigned long)pmd); + + return 1; +} + +/** + * pmd_free_pte_page - Clear pmd entry and free pte page. + * @pmd: Pointer to a PMD. + * + * Context: The pmd range has been unmaped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pmd_free_pte_page(pmd_t *pmd) +{ + pte_t *pte; + + if (pmd_none(*pmd)) + return 1; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + free_page((unsigned long)pte); + + return 1; +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index c3c5274410a9..9bb7f0ab9fe6 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -63,7 +63,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) * It's enough to flush this one mapping. * (PGE mappings get flushed as well) */ - __flush_tlb_one(vaddr); + __flush_tlb_one_kernel(vaddr); } unsigned long __FIXADDR_TOP = 0xfffff000; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index ce38f165489b..4d418e705878 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -66,12 +66,22 @@ static void __init pti_print_if_secure(const char *reason) pr_info("%s\n", reason); } +enum pti_mode { + PTI_AUTO = 0, + PTI_FORCE_OFF, + PTI_FORCE_ON +} pti_mode; + void __init pti_check_boottime_disable(void) { char arg[5]; int ret; + /* Assume mode is auto unless overridden. 
*/ + pti_mode = PTI_AUTO; + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } @@ -79,18 +89,23 @@ void __init pti_check_boottime_disable(void) ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); if (ret > 0) { if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; } if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_mode = PTI_FORCE_ON; pti_print_if_secure("force enabled on command line."); goto enable; } - if (ret == 4 && !strncmp(arg, "auto", 4)) + if (ret == 4 && !strncmp(arg, "auto", 4)) { + pti_mode = PTI_AUTO; goto autosel; + } } if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; } @@ -149,7 +164,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) * * Returns a pointer to a P4D on success, or NULL on failure. */ -static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); @@ -177,7 +192,7 @@ static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) * * Returns a pointer to a PMD on success, or NULL on failure. 
*/ -static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d = pti_user_pagetable_walk_p4d(address); @@ -267,7 +282,7 @@ static void __init pti_setup_vsyscall(void) static void __init pti_setup_vsyscall(void) { } #endif -static void __init +static void pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) { unsigned long addr; @@ -300,6 +315,27 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) return; /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; + + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistentency with + * code that only set this bit when supported. + */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + + /* * Copy the PMD. 
That is, the kernelmode and usermode * tables will share the last-level page tables of this * address range @@ -332,7 +368,7 @@ static void __init pti_clone_user_shared(void) } /* - * Clone the ESPFIX P4D into the user space visinble page table + * Clone the ESPFIX P4D into the user space visible page table */ static void __init pti_setup_espfix64(void) { @@ -348,7 +384,103 @@ static void __init pti_clone_entry_text(void) { pti_clone_pmds((unsigned long) __entry_text_start, (unsigned long) __irqentry_text_end, - _PAGE_RW | _PAGE_GLOBAL); + _PAGE_RW); +} + +/* + * Global pages and PCIDs are both ways to make kernel TLB entries + * live longer, reduce TLB misses and improve kernel performance. + * But, leaving all kernel text Global makes it potentially accessible + * to Meltdown-style attacks which make it trivial to find gadgets or + * defeat KASLR. + * + * Only use global pages when it is really worth it. + */ +static inline bool pti_kernel_image_global_ok(void) +{ + /* + * Systems with PCIDs get litlle benefit from global + * kernel text and are not worth the downsides. + */ + if (cpu_feature_enabled(X86_FEATURE_PCID)) + return false; + + /* + * Only do global kernel image for pti=auto. Do the most + * secure thing (not global) if pti=on specified. + */ + if (pti_mode != PTI_AUTO) + return false; + + /* + * K8 may not tolerate the cleared _PAGE_RW on the userspace + * global kernel image pages. Do the safe thing (disable + * global kernel image). This is unlikely to ever be + * noticed because PTI is disabled by default on AMD CPUs. + */ + if (boot_cpu_has(X86_FEATURE_K8)) + return false; + + /* + * RANDSTRUCT derives its hardening benefits from the + * attacker's lack of knowledge about the layout of kernel + * data structures. Keep the kernel image non-global in + * cases where RANDSTRUCT is in use to help keep the layout a + * secret. 
+ */ + if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT)) + return false; + + return true; +} + +/* + * For some configurations, map all of kernel text into the user page + * tables. This reduces TLB misses, especially on non-PCID systems. + */ +void pti_clone_kernel_text(void) +{ + /* + * rodata is part of the kernel image and is normally + * readable on the filesystem or on the web. But, do not + * clone the areas past rodata, they might contain secrets. + */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = (unsigned long)__end_rodata_hpage_align; + + if (!pti_kernel_image_global_ok()) + return; + + pr_debug("mapping partial kernel image into user address space\n"); + + /* + * Note that this will undo _some_ of the work that + * pti_set_kernel_image_nonglobal() did to clear the + * global bit. + */ + pti_clone_pmds(start, end, _PAGE_RW); +} + +/* + * This is the only user for it and it is not arch-generic like + * the other set_memory.h functions. Just extern it. + */ +extern int set_memory_nonglobal(unsigned long addr, int numpages); +void pti_set_kernel_image_nonglobal(void) +{ + /* + * The identity map is created with PMDs, regardless of the + * actual length of the kernel. We need to clear + * _PAGE_GLOBAL up to a PMD boundary, not just to the end + * of the image. 
+ */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); + + if (pti_kernel_image_global_ok()) + return; + + set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } /* @@ -362,6 +494,10 @@ void __init pti_init(void) pr_info("enabled\n"); pti_clone_user_shared(); + + /* Undo all global bits from the init pagetables in head_64.S: */ + pti_set_kernel_image_nonglobal(); + /* Replace some of the global bits just for shared entry text: */ pti_clone_entry_text(); pti_setup_espfix64(); pti_setup_vsyscall(); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8dcc0607f805..e055d1a06699 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) unsigned long sp = current_stack_pointer; pgd_t *pgd = pgd_offset(mm, sp); - if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgtable_l5_enabled) { if (unlikely(pgd_none(*pgd))) { pgd_t *pgd_ref = pgd_offset_k(sp); @@ -498,7 +498,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * flush that changes context.tlb_gen from 2 to 3. If they get * processed on this CPU in reverse order, we'll see * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. - * If we were to use __flush_tlb_single() and set local_tlb_gen to + * If we were to use __flush_tlb_one_user() and set local_tlb_gen to * 3, we'd be break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. 
* @@ -519,7 +519,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, addr = f->start; while (addr < f->end) { - __flush_tlb_single(addr); + __flush_tlb_one_user(addr); addr += PAGE_SIZE; } if (local) @@ -613,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { int cpu; - struct flush_tlb_info info = { + struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, }; @@ -666,7 +666,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_one(addr); + __flush_tlb_one_kernel(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4923d92f918d..b725154182cc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,10 +11,11 @@ #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> -#include <asm/cacheflush.h> -#include <asm/set_memory.h> #include <linux/bpf.h> +#include <asm/set_memory.h> +#include <asm/nospec-branch.h> + /* * assembly code in arch/x86/net/bpf_jit.S */ @@ -60,7 +61,12 @@ static bool is_imm8(int value) static bool is_simm32(s64 value) { - return value == (s64) (s32) value; + return value == (s64)(s32)value; +} + +static bool is_uimm32(u64 value) +{ + return value == (u64)(u32)value; } /* mov dst, src */ @@ -97,16 +103,6 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JLE 0x7E #define X86_JG 0x7F -static void bpf_flush_icache(void *start, void *end) -{ - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - smp_wmb(); - flush_icache_range((unsigned long)start, (unsigned long)end); - set_fs(old_fs); -} - #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) @@ -211,7 +207,7 @@ struct jit_context { /* emit x64 prologue code for BPF program and check it's size. 
* bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; int cnt = 0; @@ -246,18 +242,21 @@ static void emit_prologue(u8 **pprog, u32 stack_depth) /* mov qword ptr [rbp+24],r15 */ EMIT4(0x4C, 0x89, 0x7D, 24); - /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls - * we need to reset the counter to 0. It's done in two instructions, - * resetting rax register to 0 (xor on eax gets 0 extended), and - * moving it to the counter location. - */ + if (!ebpf_from_cbpf) { + /* Clear the tail call counter (tail_call_cnt): for eBPF tail + * calls we need to reset the counter to 0. It's done in two + * instructions, resetting rax register to 0, and moving it + * to the counter location. + */ - /* xor eax, eax */ - EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp+32], rax */ - EMIT4(0x48, 0x89, 0x45, 32); + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + } - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; } @@ -290,7 +289,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 43 /* number of bytes to jump */ +#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -299,7 +298,7 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 32 +#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ @@ -313,7 +312,7 @@ static void emit_bpf_tail_call(u8 
**pprog) * goto out; */ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ -#define OFFSET3 10 +#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; @@ -326,7 +325,7 @@ static void emit_bpf_tail_call(u8 **pprog) * rdi == ctx (1st arg) * rax == prog->bpf_func + prologue_size */ - EMIT2(0xFF, 0xE0); /* jmp rax */ + RETPOLINE_RAX_BPF_JIT(); /* out: */ BUILD_BUG_ON(cnt - label1 != OFFSET1); @@ -355,6 +354,86 @@ static void emit_load_skb_data_hlen(u8 **pprog) *pprog = prog; } +static void emit_mov_imm32(u8 **pprog, bool sign_propagate, + u32 dst_reg, const u32 imm32) +{ + u8 *prog = *pprog; + u8 b1, b2, b3; + int cnt = 0; + + /* optimization: if imm32 is positive, use 'mov %eax, imm32' + * (which zero-extends imm32) to save 2 bytes. + */ + if (sign_propagate && (s32)imm32 < 0) { + /* 'mov %rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, dst_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); + goto done; + } + + /* optimization: if imm32 is zero, use 'xor %eax, %eax' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + goto done; + } + + /* mov %eax, imm32 */ + if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); +done: + *pprog = prog; +} + +static void emit_mov_imm64(u8 **pprog, u32 dst_reg, + const u32 imm32_hi, const u32 imm32_lo) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + /* For emitting plain u32, where sign bit must not be + * propagated LLVM tends to load imm64 over mov32 + * directly, so save couple of bytes by just doing + * 'mov %eax, imm32' instead. 
+ */ + emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else { + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(imm32_lo, 4); + EMIT(imm32_hi, 4); + } + + *pprog = prog; +} + +static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) { + /* mov dst, src */ + EMIT_mov(dst_reg, src_reg); + } else { + /* mov32 dst, src */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -368,7 +447,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog, bpf_prog->aux->stack_depth); + emit_prologue(&prog, bpf_prog->aux->stack_depth, + bpf_prog_was_classic(bpf_prog)); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); @@ -377,7 +457,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; - u8 b1 = 0, b2 = 0, b3 = 0; + u8 b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; bool reload_skb_data; @@ -413,16 +493,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; - /* mov dst, src */ case BPF_ALU64 | BPF_MOV | BPF_X: - EMIT_mov(dst_reg, src_reg); - break; - - /* mov32 dst, src */ case BPF_ALU | BPF_MOV | BPF_X: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); - EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + emit_mov_reg(&prog, + BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, src_reg); break; /* neg dst */ @@ -485,58 +560,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU64 | BPF_MOV | BPF_K: - /* optimization: if imm32 is 
positive, - * use 'mov eax, imm32' (which zero-extends imm32) - * to save 2 bytes - */ - if (imm32 < 0) { - /* 'mov rax, imm32' sign extends imm32 */ - b1 = add_1mod(0x48, dst_reg); - b2 = 0xC7; - b3 = 0xC0; - EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); - break; - } - case BPF_ALU | BPF_MOV | BPF_K: - /* optimization: if imm32 is zero, use 'xor <dst>,<dst>' - * to save 3 bytes. - */ - if (imm32 == 0) { - if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); - break; - } - - /* mov %eax, imm32 */ - if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); - EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); + emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, imm32); break; case BPF_LD | BPF_IMM | BPF_DW: - /* optimization: if imm64 is zero, use 'xor <dst>,<dst>' - * to save 7 bytes. - */ - if (insn[0].imm == 0 && insn[1].imm == 0) { - b1 = add_2mod(0x48, dst_reg, dst_reg); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); - - insn++; - i++; - break; - } - - /* movabsq %rax, imm64 */ - EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); - EMIT(insn[0].imm, 4); - EMIT(insn[1].imm, 4); - + emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm); insn++; i++; break; @@ -593,36 +623,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_X: - EMIT1(0x50); /* push rax */ - EMIT1(0x52); /* push rdx */ + { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + + if (dst_reg != BPF_REG_0) + EMIT1(0x50); /* push rax */ + if (dst_reg != BPF_REG_3) + EMIT1(0x52); /* push rdx */ /* mov r11, dst_reg */ EMIT_mov(AUX_REG, dst_reg); if (BPF_SRC(insn->code) == BPF_X) - /* mov rax, src_reg */ - EMIT_mov(BPF_REG_0, src_reg); + emit_mov_reg(&prog, is64, BPF_REG_0, src_reg); else - /* mov rax, imm32 */ - EMIT3_off32(0x48, 0xC7, 
0xC0, imm32); + emit_mov_imm32(&prog, is64, BPF_REG_0, imm32); - if (BPF_CLASS(insn->code) == BPF_ALU64) + if (is64) EMIT1(add_1mod(0x48, AUX_REG)); else if (is_ereg(AUX_REG)) EMIT1(add_1mod(0x40, AUX_REG)); /* mul(q) r11 */ EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); - /* mov r11, rax */ - EMIT_mov(AUX_REG, BPF_REG_0); - - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ - - /* mov dst_reg, r11 */ - EMIT_mov(dst_reg, AUX_REG); + if (dst_reg != BPF_REG_3) + EMIT1(0x5A); /* pop rdx */ + if (dst_reg != BPF_REG_0) { + /* mov dst_reg, rax */ + EMIT_mov(dst_reg, BPF_REG_0); + EMIT1(0x58); /* pop rax */ + } break; - + } /* shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: @@ -640,7 +672,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_RSH: b3 = 0xE8; break; case BPF_ARSH: b3 = 0xF8; break; } - EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); + + if (imm32 == 1) + EMIT2(0xD1, add_1reg(b3, dst_reg)); + else + EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; case BPF_ALU | BPF_LSH | BPF_X: @@ -1187,7 +1223,7 @@ skip_init_addrs: * may converge on the last pass. 
In such case do one more * pass to emit the final image */ - for (pass = 0; pass < 10 || image; pass++) { + for (pass = 0; pass < 20 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); if (proglen <= 0) { image = NULL; @@ -1214,13 +1250,13 @@ skip_init_addrs: } } oldproglen = proglen; + cond_resched(); } if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, proglen, pass + 1, image); if (image) { - bpf_flush_icache(header, image + proglen); if (!prog->is_func || extra_pass) { bpf_jit_binary_lock_ro(header); } else { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 174c59774cc9..a7a7677265b6 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -460,7 +460,7 @@ static int nmi_setup(void) goto fail; for_each_possible_cpu(cpu) { - if (!cpu) + if (!IS_ENABLED(CONFIG_SMP) || !cpu) continue; memcpy(per_cpu(cpu_msrs, cpu).counters, diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 7df49c40665e..5559dcaddd5e 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -140,12 +140,10 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = { void __init pci_acpi_crs_quirks(void) { - int year; + int year = dmi_get_bios_year(); - if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) { - if (iomem_resource.end <= 0xffffffff) - pci_use_crs = false; - } + if (year >= 0 && year < 2008 && iomem_resource.end <= 0xffffffff) + pci_use_crs = false; dmi_check_system(pci_crs_quirks); diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 2d9503323d10..a51074c55982 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -195,14 +195,13 @@ static const struct pci_raw_ops pci_direct_conf2 = { static int __init pci_sanity_check(const struct pci_raw_ops *o) { u32 x = 0; - int year, devfn; + int devfn; if (pci_probe & PCI_NO_CHECKS) return 1; /* Assume Type 1 works for newer systems. This handles machines that don't have anything on PCI Bus 0. 
*/ - dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL); - if (year >= 2001) + if (dmi_get_bios_year() >= 2001) return 1; for (devfn = 0; devfn < 0x100; devfn++) { diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 1cb01abcb1be..dfbe6ac38830 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -4,6 +4,7 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/pci.h> +#include <asm/jailhouse_para.h> #include <asm/pci_x86.h> /* @@ -34,13 +35,14 @@ int __init pci_legacy_init(void) void pcibios_scan_specific_bus(int busn) { + int stride = jailhouse_paravirt() ? 1 : 8; int devfn; u32 l; if (pci_find_bus(0, busn)) return; - for (devfn = 0; devfn < 256; devfn += 8) { + for (devfn = 0; devfn < 256; devfn += stride) { if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) && l != 0x0000 && l != 0xffff) { DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 96684d0adcf9..7389db538c30 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -94,8 +94,8 @@ static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start, return new; } -static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start, - int end, u64 addr) +struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start, + int end, u64 addr) { struct pci_mmcfg_region *new; @@ -547,19 +547,14 @@ static void __init pci_mmcfg_reject_broken(int early) static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, struct acpi_mcfg_allocation *cfg) { - int year; - if (cfg->address < 0xFFFFFFFF) return 0; if (!strncmp(mcfg->header.oem_id, "SGI", 3)) return 0; - if (mcfg->header.revision >= 1) { - if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && - year >= 2010) - return 0; - } + if ((mcfg->header.revision >= 1) && (dmi_get_bios_year() >= 2010)) + return 0; pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " "is above 
4GB, ignored\n", cfg->pci_segment, diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 75577c1490c4..7a5bafb76d77 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -159,43 +159,6 @@ static dma_addr_t a2p(dma_addr_t a, struct pci_dev *pdev) return p; } -/** - * sta2x11_swiotlb_alloc_coherent - Allocate swiotlb bounce buffers - * returns virtual address. This is the only "special" function here. - * @dev: PCI device - * @size: Size of the buffer - * @dma_handle: DMA address - * @flags: memory flags - */ -static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, - size_t size, - dma_addr_t *dma_handle, - gfp_t flags, - unsigned long attrs) -{ - void *vaddr; - - vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs); - *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); - return vaddr; -} - -/* We have our own dma_ops: the same as swiotlb but from alloc (above) */ -static const struct dma_map_ops sta2x11_dma_ops = { - .alloc = sta2x11_swiotlb_alloc_coherent, - .free = x86_swiotlb_free_coherent, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, - .mapping_error = swiotlb_dma_mapping_error, - .dma_supported = x86_dma_supported, -}; - /* At setup time, we use our own ops if the device is a ConneXt one */ static void sta2x11_setup_pdev(struct pci_dev *pdev) { @@ -205,7 +168,8 @@ static void sta2x11_setup_pdev(struct pci_dev *pdev) return; pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); - pdev->dev.dma_ops = &sta2x11_dma_ops; + pdev->dev.dma_ops = &swiotlb_dma_ops; + pdev->dev.archdata.is_sta2x11 = true; /* We must enable all devices as master, 
for audio DMA to work */ pci_set_master(pdev); @@ -225,7 +189,7 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { struct sta2x11_mapping *map; - if (dev->dma_ops != &sta2x11_dma_ops) { + if (!dev->archdata.is_sta2x11) { if (!dev->dma_mask) return false; return addr + size - 1 <= *dev->dma_mask; @@ -243,13 +207,13 @@ bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) } /** - * phys_to_dma - Return the DMA AMBA address used for this STA2x11 device + * __phys_to_dma - Return the DMA AMBA address used for this STA2x11 device * @dev: device for a PCI device * @paddr: Physical address */ -dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { - if (dev->dma_ops != &sta2x11_dma_ops) + if (!dev->archdata.is_sta2x11) return paddr; return p2a(paddr, to_pci_dev(dev)); } @@ -259,9 +223,9 @@ dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) * @dev: device for a PCI device * @daddr: STA2x11 AMBA DMA address */ -phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) { - if (dev->dma_ops != &sta2x11_dma_ops) + if (!dev->archdata.is_sta2x11) return daddr; return a2p(daddr, to_pci_dev(dev)); } diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index d49d3be81953..034813d4ab1e 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -109,18 +109,7 @@ static int punit_dev_state_show(struct seq_file *seq_file, void *unused) return 0; } - -static int punit_dev_state_open(struct inode *inode, struct file *file) -{ - return single_open(file, punit_dev_state_show, inode->i_private); -} - -static const struct file_operations punit_dev_state_ops = { - .open = punit_dev_state_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(punit_dev_state); static 
struct dentry *punit_dbg_file; @@ -132,9 +121,9 @@ static int punit_dbgfs_register(struct punit_device *punit_device) if (!punit_dbg_file) return -ENXIO; - dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO, + dev_state = debugfs_create_file("dev_power_state", 0444, punit_dbg_file, punit_device, - &punit_dev_state_ops); + &punit_dev_state_fops); if (!dev_state) { pr_err("punit_dev_state register failed\n"); debugfs_remove(punit_dbg_file); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c310a8284358..bed7e7f4e44c 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -27,12 +27,14 @@ #include <linux/ioport.h> #include <linux/mc146818rtc.h> #include <linux/efi.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/reboot.h> #include <linux/slab.h> #include <linux/ucs2_string.h> #include <linux/mem_encrypt.h> +#include <linux/sched/task.h> #include <asm/setup.h> #include <asm/page.h> @@ -81,9 +83,8 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)__read_cr3(); - write_cr3((unsigned long)efi_scratch.efi_pgt); - goto out; + efi_switch_mm(&efi_mm); + return NULL; } early_code_mapping_set_exec(1); @@ -155,8 +156,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) pud_t *pud; if (!efi_enabled(EFI_OLD_MEMMAP)) { - write_cr3((unsigned long)save_pgd); - __flush_tlb_all(); + efi_switch_mm(efi_scratch.prev_mm); return; } @@ -190,7 +190,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) early_code_mapping_set_exec(0); } -static pgd_t *efi_pgd; +EXPORT_SYMBOL_GPL(efi_mm); /* * We need our own copy of the higher levels of the page tables @@ -203,7 +203,7 @@ static pgd_t *efi_pgd; */ int __init efi_alloc_page_tables(void) { - pgd_t *pgd; + pgd_t *pgd, *efi_pgd; p4d_t *p4d; pud_t *pud; gfp_t gfp_mask; @@ -225,12 +225,16 @@ int __init efi_alloc_page_tables(void) pud = 
pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { - if (CONFIG_PGTABLE_LEVELS > 4) + if (pgtable_l5_enabled) free_page((unsigned long) pgd_page_vaddr(*pgd)); - free_page((unsigned long)efi_pgd); + free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); return -ENOMEM; } + efi_mm.pgd = efi_pgd; + mm_init_cpumask(&efi_mm); + init_new_context(NULL, &efi_mm); + return 0; } @@ -243,6 +247,7 @@ void efi_sync_low_kernel_mappings(void) pgd_t *pgd_k, *pgd_efi; p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; + pgd_t *efi_pgd = efi_mm.pgd; if (efi_enabled(EFI_OLD_MEMMAP)) return; @@ -255,8 +260,8 @@ void efi_sync_low_kernel_mappings(void) * only span a single PGD entry and that the entry also maps * other important kernel regions. */ - BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != (EFI_VA_END & PGDIR_MASK)); pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); @@ -336,20 +341,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) unsigned long pfn, text, pf; struct page *page; unsigned npages; - pgd_t *pgd; + pgd_t *pgd = efi_mm.pgd; if (efi_enabled(EFI_OLD_MEMMAP)) return 0; /* - * Since the PGD is encrypted, set the encryption mask so that when - * this value is loaded into cr3 the PGD will be decrypted during - * the pagetable walk. - */ - efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd); - pgd = efi_pgd; - - /* * It can happen that the physical address of new_memmap lands in memory * which is not mapped in the EFI page table. 
Therefore we need to go * and ident-map those pages containing the map before calling @@ -362,8 +359,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 1; } - efi_scratch.use_pgd = true; - /* * Certain firmware versions are way too sentimential and still believe * they are exclusive and unquestionable owners of the first physical page, @@ -417,7 +412,7 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) { unsigned long flags = _PAGE_RW; unsigned long pfn; - pgd_t *pgd = efi_pgd; + pgd_t *pgd = efi_mm.pgd; if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; @@ -521,7 +516,7 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len) static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf) { unsigned long pfn; - pgd_t *pgd = efi_pgd; + pgd_t *pgd = efi_mm.pgd; int err1, err2; /* Update the 1:1 mapping */ @@ -618,10 +613,26 @@ void __init efi_dump_pagetable(void) if (efi_enabled(EFI_OLD_MEMMAP)) ptdump_walk_pgd_level(NULL, swapper_pg_dir); else - ptdump_walk_pgd_level(NULL, efi_pgd); + ptdump_walk_pgd_level(NULL, efi_mm.pgd); #endif } +/* + * Makes the calling thread switch to/from efi_mm context. Can be used + * for SetVirtualAddressMap() i.e. current->active_mm == init_mm as well + * as during efi runtime calls i.e current->active_mm == current_mm. + * We are not mm_dropping()/mm_grabbing() any mm, because we are not + * losing/creating any references. 
+ */ +void efi_switch_mm(struct mm_struct *mm) +{ + task_lock(current); + efi_scratch.prev_mm = current->active_mm; + current->active_mm = mm; + switch_mm(efi_scratch.prev_mm, mm, NULL); + task_unlock(current); +} + #ifdef CONFIG_EFI_MIXED extern efi_status_t efi64_thunk(u32, ...); @@ -675,16 +686,13 @@ efi_status_t efi_thunk_set_virtual_address_map( efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_scratch.prev_cr3 = __read_cr3(); - write_cr3((unsigned long)efi_scratch.efi_pgt); - __flush_tlb_all(); + efi_switch_mm(&efi_mm); func = (u32)(unsigned long)phys_set_virtual_address_map; status = efi64_thunk(func, memory_map_size, descriptor_size, descriptor_version, virtual_map); - write_cr3(efi_scratch.prev_cr3); - __flush_tlb_all(); + efi_switch_mm(efi_scratch.prev_mm); local_irq_restore(flags); return status; diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 189b218da87c..46c58b08739c 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -33,7 +33,7 @@ ENTRY(efi64_thunk) * Switch to 1:1 mapped 32-bit stack pointer. */ movq %rsp, efi_saved_sp(%rip) - movq efi_scratch+25(%rip), %rsp + movq efi_scratch(%rip), %rsp /* * Calculate the physical address of the kernel text. 
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 5b513ccffde4..36c1f8b9f7e0 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -75,7 +75,7 @@ struct quark_security_header { u32 rsvd[2]; }; -static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; +static const efi_char16_t efi_dummy_name[] = L"DUMMY"; static bool efi_no_storage_paranoia; @@ -105,7 +105,8 @@ early_param("efi_no_storage_paranoia", setup_storage_paranoia); */ void efi_delete_dummy_variable(void) { - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + efi.set_variable((efi_char16_t *)efi_dummy_name, + &EFI_DUMMY_GUID, EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, @@ -177,12 +178,13 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, * that by attempting to use more space than is available. */ unsigned long dummy_size = remaining_size + 1024; - void *dummy = kzalloc(dummy_size, GFP_ATOMIC); + void *dummy = kzalloc(dummy_size, GFP_KERNEL); if (!dummy) return EFI_OUT_OF_RESOURCES; - status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + status = efi.set_variable((efi_char16_t *)efi_dummy_name, + &EFI_DUMMY_GUID, EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 2c67bae6bb53..2ebdf31d9996 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -79,7 +79,7 @@ static void intel_mid_power_off(void) static void intel_mid_reboot(void) { - intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); + intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); } static unsigned long __init intel_mid_calibrate_tsc(void) @@ -199,6 +199,12 @@ void __init x86_intel_mid_early_setup(void) legacy_pic = &null_legacy_pic; + /* + * Do nothing for now as everything needed done in + * 
x86_intel_mid_early_setup() below. + */ + x86_init.acpi.reduced_hw_early_init = x86_init_noop; + pm_power_off = intel_mid_power_off; machine_ops.emergency_restart = intel_mid_reboot; diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 17d6d2296e4d..49828c2707ac 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -224,25 +224,7 @@ static int imr_dbgfs_state_show(struct seq_file *s, void *unused) mutex_unlock(&idev->lock); return ret; } - -/** - * imr_state_open - debugfs open callback. - * - * @inode: pointer to struct inode. - * @file: pointer to struct file. - * @return: result of single open. - */ -static int imr_state_open(struct inode *inode, struct file *file) -{ - return single_open(file, imr_dbgfs_state_show, inode->i_private); -} - -static const struct file_operations imr_state_ops = { - .open = imr_state_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(imr_dbgfs_state); /** * imr_debugfs_register - register debugfs hooks. 
@@ -252,8 +234,8 @@ static const struct file_operations imr_state_ops = { */ static int imr_debugfs_register(struct imr_device *idev) { - idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL, - idev, &imr_state_ops); + idev->file = debugfs_create_file("imr_state", 0444, NULL, idev, + &imr_dbgfs_state_fops); return PTR_ERR_OR_ZERO(idev->file); } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index c2e9285d1bf1..b36caae0fb2f 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, local_flush_tlb(); stat->d_alltlb++; } else { - __flush_tlb_single(msg->address); + __flush_tlb_one_user(msg->address); stat->d_onetlb++; } stat->d_requestee++; @@ -2255,8 +2255,6 @@ static int __init uv_bau_init(void) init_uvhub(uvhub, vector, uv_base_pnode); } - alloc_intr_gate(vector, uv_bau_message_intr1); - for_each_possible_blade(uvhub) { if (uv_blade_nr_possible_cpus(uvhub)) { unsigned long val; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 0ef5e5204968..ccf4a49bb065 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -50,7 +50,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; - p4d_t *p4d; + p4d_t *p4d = NULL; + pgprot_t pgtable_prot = __pgprot(_KERNPG_TABLE); + pgprot_t pmd_text_prot = __pgprot(__PAGE_KERNEL_LARGE_EXEC); + + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(pmd_text_prot) &= __default_kernel_pte_mask; + pgprot_val(pgtable_prot) &= __default_kernel_pte_mask; /* * The new mapping only has to cover the page containing the image @@ -66,7 +72,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * tables used by the image kernel. 
*/ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); if (!p4d) return -ENOMEM; @@ -81,15 +87,19 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) return -ENOMEM; set_pmd(pmd + pmd_index(restore_jump_address), - __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); + __pmd((jump_address_phys & PMD_MASK) | pgprot_val(pmd_text_prot))); set_pud(pud + pud_index(restore_jump_address), - __pud(__pa(pmd) | _KERNPG_TABLE)); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); + __pud(__pa(pmd) | pgprot_val(pgtable_prot))); + if (p4d) { + p4d_t new_p4d = __p4d(__pa(pud) | pgprot_val(pgtable_prot)); + pgd_t new_pgd = __pgd(__pa(p4d) | pgprot_val(pgtable_prot)); + + set_p4d(p4d + p4d_index(restore_jump_address), new_p4d); + set_pgd(pgd + pgd_index(restore_jump_address), new_pgd); } else { /* No p4d for 4-level paging: point the pgd to the pud page table */ - set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); + pgd_t new_pgd = __pgd(__pa(pud) | pgprot_val(pgtable_prot)); + set_pgd(pgd + pgd_index(restore_jump_address), new_pgd); } return 0; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 2f15a2ac4209..2e9ee023e6bc 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -6,6 +6,9 @@ purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) +$(obj)/sha256.o: $(srctree)/lib/sha256.c + $(call if_changed_rule,cc_o_c) + LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro @@ -16,7 +19,7 @@ KCOV_INSTRUMENT := n # in turn leaves some undefined symbols like __fentry__ in purgatory and not # sure how to relocate those. 
Like kexec-tools, use custom flags. -KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large +KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -Os -mcmodel=large KBUILD_CFLAGS += -m$(BITS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 470edad96bb9..025c34ac0d84 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -11,9 +11,9 @@ */ #include <linux/bug.h> +#include <linux/sha256.h> #include <asm/purgatory.h> -#include "sha256.h" #include "../boot/string.h" unsigned long purgatory_backup_dest __section(.kexec-purgatory); diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c deleted file mode 100644 index 548ca675a14a..000000000000 --- a/arch/x86/purgatory/sha256.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. - * - * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> - * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> - * Copyright (c) 2014 Red Hat Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- */ - -#include <linux/bitops.h> -#include <asm/byteorder.h> -#include "sha256.h" -#include "../boot/string.h" - -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} - -#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) -#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) -#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) -#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) - -static inline void LOAD_OP(int I, u32 *W, const u8 *input) -{ - W[I] = __be32_to_cpu(((__be32 *)(input))[I]); -} - -static inline void BLEND_OP(int I, u32 *W) -{ - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; -} - -static void sha256_transform(u32 *state, const u8 *input) -{ - u32 a, b, c, d, e, f, g, h, t1, t2; - u32 W[64]; - int i; - - /* load the input */ - for (i = 0; i < 16; i++) - LOAD_OP(i, W, input); - - /* now blend */ - for (i = 16; i < 64; i++) - BLEND_OP(i, W); - - /* load the state into our registers */ - a = state[0]; b = state[1]; c = state[2]; d = state[3]; - e = state[4]; f = state[5]; g = state[6]; h = state[7]; - - /* now iterate */ - t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + 
t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 
0xbf597fc7 + W[27]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; - t2 = e0(c) + Maj(c, d, e); f += t1; 
b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; - - /* clear any sensitive info... 
*/ - a = b = c = d = e = f = g = h = t1 = t2 = 0; - memset(W, 0, 64 * sizeof(u32)); -} - -int sha256_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - - return 0; -} - -int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) -{ - unsigned int partial, done; - const u8 *src; - - partial = sctx->count & 0x3f; - sctx->count += len; - done = 0; - src = data; - - if ((partial + len) > 63) { - if (partial) { - done = -partial; - memcpy(sctx->buf + partial, data, done + 64); - src = sctx->buf; - } - - do { - sha256_transform(sctx->state, src); - done += 64; - src = data + done; - } while (done + 63 < len); - - partial = 0; - } - memcpy(sctx->buf + partial, src, len - done); - - return 0; -} - -int sha256_final(struct sha256_state *sctx, u8 *out) -{ - __be32 *dst = (__be32 *)out; - __be64 bits; - unsigned int index, pad_len; - int i; - static const u8 padding[64] = { 0x80, }; - - /* Save number of bits */ - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64. */ - index = sctx->count & 0x3f; - pad_len = (index < 56) ? (56 - index) : ((64+56) - index); - sha256_update(sctx, padding, pad_len); - - /* Append length (before padding) */ - sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Zeroize sensitive information. */ - memset(sctx, 0, sizeof(*sctx)); - - return 0; -} diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h deleted file mode 100644 index 2867d9825a57..000000000000 --- a/arch/x86/purgatory/sha256.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2014 Red Hat Inc. 
- * - * Author: Vivek Goyal <vgoyal@redhat.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#ifndef SHA256_H -#define SHA256_H - -#include <linux/types.h> -#include <crypto/sha.h> - -extern int sha256_init(struct sha256_state *sctx); -extern int sha256_update(struct sha256_state *sctx, const u8 *input, - unsigned int length); -extern int sha256_final(struct sha256_state *sctx, u8 *hash); - -#endif /* SHA256_H */ diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c index d886b1fa36f0..795ca4f2cb3c 100644 --- a/arch/x86/purgatory/string.c +++ b/arch/x86/purgatory/string.c @@ -10,4 +10,16 @@ * Version 2. See the file COPYING for more details. */ +#include <linux/types.h> + #include "../boot/string.c" + +void *memcpy(void *dst, const void *src, size_t len) +{ + return __builtin_memcpy(dst, src, len); +} + +void *memset(void *dst, int c, size_t len) +{ + return __builtin_memset(dst, c, len); +} diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index de53bd15df5a..24bb7598774e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -102,7 +102,7 @@ ENTRY(startup_32) * don't we'll eventually crash trying to execute encrypted * instructions. */ - bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags + btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags jnc .Ldone movl $MSR_K8_SYSCFG, %ecx rdmsr diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5d73c443e778..220e97841e49 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -770,9 +770,12 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; case R_X86_64_PC32: + case R_X86_64_PLT32: /* * PC relative relocations don't need to be adjusted unless * referencing a percpu symbol. + * + * NB: R_X86_64_PLT32 can be treated as R_X86_64_PC32. 
*/ if (is_percpu_sym(sym, symname)) add_reloc(&relocs32neg, offset); diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index b7d73400ea29..f31e5d903161 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -30,11 +30,7 @@ #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_PPRO_FENCE -#define dma_rmb() rmb() -#else /* CONFIG_X86_PPRO_FENCE */ #define dma_rmb() barrier() -#endif /* CONFIG_X86_PPRO_FENCE */ #define dma_wmb() barrier() #include <asm-generic/barrier.h> diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index 1518d2805ae8..27361cbb7ca9 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -6,11 +6,12 @@ #include <sysdep/stub.h> #include <sysdep/faultinfo.h> #include <sysdep/mcontext.h> +#include <sys/ucontext.h> void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig, siginfo_t *info, void *p) { - struct ucontext *uc = p; + ucontext_t *uc = p; GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA), &uc->uc_mcontext); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index f605825a04ab..c1f98f32c45f 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -18,9 +18,6 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN - # XEN_PV is not ready to work with 5-level paging. - # Changes to hypervisor are also required. - depends on !X86_5LEVEL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -79,6 +76,4 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" depends on XEN && XEN_PVHVM && ACPI - # Pre-built page tables are not ready to handle 5-level paging. 
- depends on !X86_5LEVEL def_bool n diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index de58533d3664..5e53bfbe5823 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -112,7 +112,7 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static int xen_id_always_valid(int apicid) +static int xen_id_always_valid(u32 apicid) { return 1; } @@ -215,7 +215,7 @@ static void __init xen_apic_check(void) } void __init xen_init_apic(void) { - x86_io_apic_ops.read = xen_io_apic_read; + x86_apic_ops.io_apic_read = xen_io_apic_read; /* On PV guests the APIC CPUID bit is disabled so none of the * routines end up executing. */ if (!xen_initial_domain()) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index c047f42552e1..c36d23aa6c35 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1259,10 +1259,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - /* Work out if we support NX */ - get_cpu_cap(&boot_cpu_data); - x86_configure_nx(); - /* Get mfn list */ xen_build_dynamic_phys_to_machine(); @@ -1272,6 +1268,10 @@ asmlinkage __visible void __init xen_start_kernel(void) */ xen_setup_gdt(0); + /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); + x86_configure_nx(); + xen_init_irq_ops(); /* Let's presume PV guests always boot on vCPU with id 0. */ @@ -1376,8 +1376,6 @@ asmlinkage __visible void __init xen_start_kernel(void) if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); - add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; } else { @@ -1410,6 +1408,10 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_boot_params_init_edd(); } + + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. 
*/ pci_probe &= ~PCI_PROBE_BIOS; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 436c4f003e17..aa1c6a6831a9 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -6,6 +6,7 @@ #include <asm/io_apic.h> #include <asm/hypervisor.h> #include <asm/e820/api.h> +#include <asm/x86_init.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -16,15 +17,20 @@ /* * PVH variables. * - * xen_pvh and pvh_bootparams need to live in data segment since they - * are used after startup_{32|64}, which clear .bss, are invoked. + * xen_pvh pvh_bootparams and pvh_start_info need to live in data segment + * since they are used after startup_{32|64}, which clear .bss, are invoked. */ bool xen_pvh __attribute__((section(".data"))) = 0; struct boot_params pvh_bootparams __attribute__((section(".data"))); +struct hvm_start_info pvh_start_info __attribute__((section(".data"))); -struct hvm_start_info pvh_start_info; unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +static u64 pvh_get_root_pointer(void) +{ + return pvh_start_info.rsdp_paddr; +} + static void __init init_pvh_bootparams(void) { struct xen_memory_map memmap; @@ -71,6 +77,8 @@ static void __init init_pvh_bootparams(void) */ pvh_bootparams.hdr.version = 0x212; pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ + + x86_init.acpi.get_root_pointer = pvh_get_root_pointer; } /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d85076223a69..486c0a34d00b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; +static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready); + /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. 
@@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr) } +/* + * During early boot all page table pages are pinned, but we do not have struct + * pages, so return true until struct pages are ready. + */ static bool xen_page_pinned(void *ptr) { - struct page *page = virt_to_page(ptr); + if (static_branch_likely(&xen_struct_pages_ready)) { + struct page *page = virt_to_page(ptr); - return PagePinned(page); + return PagePinned(page); + } + return true; } static void xen_extend_mmu_update(const struct mmu_update *update) @@ -538,6 +547,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } + +#if CONFIG_PGTABLE_LEVELS >= 5 +__visible p4dval_t xen_p4d_val(p4d_t p4d) +{ + return pte_mfn_to_pfn(p4d.p4d); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val); + +__visible p4d_t xen_make_p4d(p4dval_t p4d) +{ + p4d = pte_pfn_to_mfn(p4d); + + return native_make_p4d(p4d); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ #endif /* CONFIG_X86_64 */ static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, @@ -820,11 +845,6 @@ void xen_mm_pin_all(void) spin_unlock(&pgd_lock); } -/* - * The init_mm pagetable is really pinned as soon as its created, but - * that's before we have page structures to store the bits. So do all - * the book-keeping now. - */ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { @@ -832,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, return 0; } -static void __init xen_mark_init_mm_pinned(void) +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now once struct pages for allocated pages are + * initialized. This happens only after free_all_bootmem() is called. 
+ */ +static void __init xen_after_bootmem(void) { + static_branch_enable(&xen_struct_pages_ready); +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1300,12 +1330,12 @@ static void xen_flush_tlb(void) preempt_enable(); } -static void xen_flush_tlb_single(unsigned long addr) +static void xen_flush_tlb_one_user(unsigned long addr) { struct mmuext_op *op; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_single(addr); + trace_xen_mmu_flush_tlb_one_user(addr); preempt_disable(); @@ -1607,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { - bool pinned = PagePinned(virt_to_page(mm->pgd)); + bool pinned = xen_page_pinned(mm->pgd); trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); if (pinned) { struct page *page = pfn_to_page(pfn); - SetPagePinned(page); + if (static_branch_likely(&xen_struct_pages_ready)) + SetPagePinned(page); if (!PageHighMem(page)) { xen_mc_batch(); @@ -2348,9 +2379,7 @@ static void __init xen_post_allocator_init(void) #ifdef CONFIG_X86_64 pv_mmu_ops.write_cr3 = &xen_write_cr3; - SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif - xen_mark_init_mm_pinned(); } static void xen_leave_lazy_mmu(void) @@ -2370,7 +2399,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_others = xen_flush_tlb_others, .pgd_alloc = xen_pgd_alloc, @@ -2411,6 +2440,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), +#endif #endif /* CONFIG_X86_64 */ .activate_mm = 
xen_activate_mm, @@ -2429,6 +2463,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + x86_init.hyper.init_after_bootmem = xen_after_bootmem; pv_mmu_ops = xen_mmu_ops; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 77c959cf81e7..7a43b2ae19f1 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -122,6 +122,8 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); + else + calculate_max_logical_packages(); if (xen_have_vcpu_info_placement) return; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index c0c756c76afe..2e20ae2fa2d6 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -425,6 +425,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ * data back is to call: */ tick_nohz_idle_enter(); + tick_nohz_idle_stop_tick_protected(); cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d9f96cc5d743..1d83152c761b 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/tick.h> +#include <linux/percpu-defs.h> #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> #include <xen/events.h> +#include <asm/cpufeatures.h> +#include <asm/msr-index.h> #include <asm/xen/hypercall.h> #include <asm/xen/page.h> #include <asm/fixmap.h> @@ -15,6 +18,8 @@ #include "mmu.h" #include "pmu.h" +static DEFINE_PER_CPU(u64, spec_ctrl); + void xen_arch_pre_suspend(void) { xen_save_time_memory_area(); @@ -35,6 +40,9 @@ void xen_arch_post_suspend(int cancelled) static void xen_vcpu_notify_restore(void *data) { + if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl)); + /* Boot processor notified via generic timekeeping_resume() */ if 
(smp_processor_id() == 0) return; @@ -44,7 +52,15 @@ static void xen_vcpu_notify_restore(void *data) static void xen_vcpu_notify_suspend(void *data) { + u64 tmp; + tick_suspend_local(); + + if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { + rdmsrl(MSR_IA32_SPEC_CTRL, tmp); + this_cpu_write(spec_ctrl, tmp); + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + } } void xen_arch_resume(void) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 96f26e026783..5077ead5e59c 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -89,7 +89,9 @@ END(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, - .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) + .long (1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_dom0) | \ + (1 << XENFEAT_linux_rsdp_unrestricted)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, |