diff options
Diffstat (limited to 'arch/x86')
125 files changed, 3085 insertions, 2172 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d234cca296db..f2ee6a8ffe65 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,7 +28,10 @@ config X86_64 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE + select SWIOTLB select X86_DEV_DMA_OPS + select ARCH_HAS_SYSCALL_WRAPPER # # Arch settings @@ -51,6 +54,7 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 @@ -132,7 +136,6 @@ config X86 select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW - select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS @@ -182,6 +185,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING + select NEED_SG_DMA_LENGTH select PCI_LOCKLESS_CONFIG select PERF_EVENTS select RTC_LIB @@ -234,13 +238,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX config SBUS bool -config NEED_DMA_MAP_STATE - def_bool y - depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB - -config NEED_SG_DMA_LENGTH - def_bool y - config GENERIC_ISA_DMA def_bool y depends on ISA_DMA_API @@ -272,6 +269,9 @@ config ARCH_HAS_CPU_RELAX config ARCH_HAS_CACHE_LINE_SIZE def_bool y +config ARCH_HAS_FILTER_PGPROT + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y @@ -870,6 +870,7 @@ config DMI config GART_IOMMU bool "Old AMD GART IOMMU support" + select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI && AMD_NB ---help--- @@ -891,6 +892,7 @@ config GART_IOMMU config CALGARY_IOMMU bool "IBM Calgary IOMMU support" + select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI ---help--- @@ -918,20 +920,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT Calgary anyway, pass 'iommu=calgary' on the kernel command line. If unsure, say Y. -# need this always selected by IOMMU for the VIA workaround -config SWIOTLB - def_bool y if X86_64 - ---help--- - Support for software bounce buffers used on x86-64 systems - which don't have a hardware IOMMU. Using this PCI devices - which can only access 32-bits of memory can be used on systems - with more than 3 GB of memory. - If unsure, say Y. - -config IOMMU_HELPER - def_bool y - depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU - config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL @@ -1453,6 +1441,7 @@ config HIGHMEM config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G + select PHYS_ADDR_T_64BIT select SWIOTLB ---help--- PAE is required for NX support, and furthermore enables @@ -1480,14 +1469,6 @@ config X86_5LEVEL Say N if unsure. -config ARCH_PHYS_ADDR_T_64BIT - def_bool y - depends on X86_64 || X86_PAE - -config ARCH_DMA_ADDR_T_64BIT - def_bool y - depends on X86_64 || HIGHMEM64G - config X86_DIRECT_GBPAGES def_bool y depends on X86_64 && !DEBUG_PAGEALLOC @@ -2008,6 +1989,9 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by previous system call. +config ARCH_HAS_KEXEC_PURGATORY + def_bool KEXEC_FILE + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE @@ -2760,11 +2744,9 @@ config OLPC_XO1_RTC config OLPC_XO1_SCI bool "OLPC XO-1 SCI extras" - depends on OLPC && OLPC_XO1_PM + depends on OLPC && OLPC_XO1_PM && GPIO_CS5535=y depends on INPUT=y select POWER_SUPPLY - select GPIO_CS5535 - select MFD_CORE ---help--- Add support for SCI-based features of the OLPC XO-1 laptop: - EC-driven system wakeups diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 47d3efff6805..09f36c0d9d4f 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -163,7 +163,8 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) if (status != EFI_SUCCESS) goto free_struct; - memcpy(rom->romdata, pci->romimage, pci->romsize); + memcpy(rom->romdata, (void *)(unsigned long)pci->romimage, + pci->romsize); return status; free_struct: @@ -269,7 +270,8 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) if (status != EFI_SUCCESS) goto free_struct; - memcpy(rom->romdata, pci->romimage, pci->romsize); + memcpy(rom->romdata, (void *)(unsigned long)pci->romimage, + pci->romsize); return status; free_struct: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fca012baba19..8169e8b7a4dc 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -306,6 +306,25 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp /* + * paging_prepare() and cleanup_trampoline() below can have GOT + * references. Adjust the table with address we are running at. + * + * Zero RAX for adjust_got: the GOT was not adjusted before; + * there's no adjustment to undo. + */ + xorq %rax, %rax + + /* + * Calculate the address the binary is loaded at and use it as + * a GOT adjustment. + */ + call 1f +1: popq %rdi + subq $1b, %rdi + + call adjust_got + + /* * At this point we are in long mode with 4-level paging enabled, * but we might want to enable 5-level paging or vice versa. * @@ -370,10 +389,14 @@ trampoline_return: /* * cleanup_trampoline() would restore trampoline memory. * + * RDI is address of the page table to use instead of page table + * in trampoline memory (if required). + * * RSI holds real mode data and needs to be preserved across * this function call. */ pushq %rsi + leaq top_pgtable(%rbx), %rdi call cleanup_trampoline popq %rsi @@ -381,6 +404,21 @@ trampoline_return: pushq $0 popfq + /* + * Previously we've adjusted the GOT with address the binary was + * loaded at. Now we need to re-adjust for relocation address. + * + * Calculate the address the binary is loaded at, so that we can + * undo the previous GOT adjustment. + */ + call 1f +1: popq %rax + subq $1b, %rax + + /* The new adjustment is the relocation address */ + movq %rbx, %rdi + call adjust_got + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. @@ -482,19 +520,6 @@ relocated: rep stosq /* - * Adjust our own GOT - */ - leaq _got(%rip), %rdx - leaq _egot(%rip), %rcx -1: - cmpq %rcx, %rdx - jae 2f - addq %rbx, (%rdx) - addq $8, %rdx - jmp 1b -2: - -/* * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ @@ -512,6 +537,27 @@ relocated: */ jmp *%rax +/* + * Adjust the global offset table + * + * RAX is the previous adjustment of the table to undo (use 0 if it's the + * first time we touch GOT). + * RDI is the new adjustment to apply. + */ +adjust_got: + /* Walk through the GOT adding the address to the entries */ + leaq _got(%rip), %rdx + leaq _egot(%rip), %rcx +1: + cmpq %rcx, %rdx + jae 2f + subq %rax, (%rdx) /* Undo previous adjustment */ + addq %rdi, (%rdx) /* Apply the new adjustment */ + addq $8, %rdx + jmp 1b +2: + ret + .code32 /* * This is the 32-bit trampoline that will be copied over to low memory. @@ -649,3 +695,10 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 + +/* + * The page table is going to be used instead of page table in the trampoline + * memory. + */ +top_pgtable: + .fill PAGE_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 66e42a098d70..a0a50b91ecef 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -54,6 +54,9 @@ unsigned int ptrs_per_p4d __ro_after_init = 1; extern unsigned long get_cmd_line_ptr(void); +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 32af1cbcd903..a362fa0b849c 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -23,14 +23,6 @@ struct paging_config { static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; /* - * The page table is going to be used instead of page table in the trampoline - * memory. - * - * It must not be in BSS as BSS is cleared after cleanup_trampoline(). - */ -static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); - -/* * Trampoline address will be printed by extract_kernel() for debugging * purposes. * @@ -134,7 +126,7 @@ out: return paging_config; } -void cleanup_trampoline(void) +void cleanup_trampoline(void *pgtable) { void *trampoline_pgtable; @@ -145,8 +137,8 @@ void cleanup_trampoline(void) * if it's there. */ if ((void *)__native_read_cr3() == trampoline_pgtable) { - memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); - native_write_cr3((unsigned long)top_pgtable); + memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); } /* Restore trampoline memory */ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index be63330c5511..352e70cd33e8 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -114,7 +114,9 @@ For 32-bit we have the following conventions - kernel is built with pushq %rsi /* pt_regs->si */ .endif pushq \rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ pushq %rcx /* pt_regs->cx */ + xorl %ecx, %ecx /* nospec cx */ pushq \rax /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 */ xorl %r8d, %r8d /* nospec r8 */ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 74f6eee15179..fbf6a6c3fd2d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -266,14 +266,13 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) } #ifdef CONFIG_X86_64 -__visible void do_syscall_64(struct pt_regs *regs) +__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) { - struct thread_info *ti = current_thread_info(); - unsigned long nr = regs->orig_ax; + struct thread_info *ti; enter_from_user_mode(); local_irq_enable(); - + ti = current_thread_info(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); @@ -282,11 +281,10 @@ __visible void do_syscall_64(struct pt_regs *regs) * table. The only functional difference is the x32 bit in * regs->orig_ax, which changes the behavior of some syscalls. */ - if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { - nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls); - regs->ax = sys_call_table[nr]( - regs->di, regs->si, regs->dx, - regs->r10, regs->r8, regs->r9); + nr &= __SYSCALL_MASK; + if (likely(nr < NR_syscalls)) { + nr = array_index_nospec(nr, NR_syscalls); + regs->ax = sys_call_table[nr](regs); } syscall_return_slowpath(regs); @@ -321,6 +319,9 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) if (likely(nr < IA32_NR_syscalls)) { nr = array_index_nospec(nr, IA32_NR_syscalls); +#ifdef CONFIG_IA32_EMULATION + regs->ax = ia32_sys_call_table[nr](regs); +#else /* * It's possible that a 32-bit syscall implementation * takes a 64-bit parameter but nonetheless assumes that @@ -331,6 +332,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) (unsigned int)regs->bx, (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->si, (unsigned int)regs->di, (unsigned int)regs->bp); +#endif /* CONFIG_IA32_EMULATION */ } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b0a4649e55ce..3166b9674429 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -233,7 +233,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) TRACE_IRQS_OFF /* IRQs are off. */ - movq %rsp, %rdi + movq %rax, %rdi + movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -913,7 +914,7 @@ ENTRY(\sym) pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif - .if \paranoid < 2 + .if \paranoid == 1 testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ jnz .Lfrom_usermode_switch_stack_\@ .endif @@ -960,7 +961,7 @@ ENTRY(\sym) jmp error_exit .endif - .if \paranoid < 2 + .if \paranoid == 1 /* * Entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 08425c42f8b7..9de7f1e1dede 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -84,13 +84,13 @@ ENTRY(entry_SYSENTER_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r8 */ xorl %r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ + pushq %r9 /* pt_regs->r9 */ xorl %r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ + pushq %r10 /* pt_regs->r10 */ xorl %r10d, %r10d /* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ + pushq %r11 /* pt_regs->r11 */ xorl %r11d, %r11d /* nospec r11 */ pushq %rbx /* pt_regs->rbx */ xorl %ebx, %ebx /* nospec rbx */ @@ -220,8 +220,11 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ + xorl %esi, %esi /* nospec si */ pushq %rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ pushq %rbp /* pt_regs->cx (stashed in bp) */ + xorl %ecx, %ecx /* nospec cx */ pushq $-ENOSYS /* pt_regs->ax */ pushq $0 /* pt_regs->r8 = 0 */ xorl %r8d, %r8d /* nospec r8 */ @@ -365,8 +368,11 @@ ENTRY(entry_INT80_compat) pushq (%rdi) /* pt_regs->di */ pushq %rsi /* pt_regs->si */ + xorl %esi, %esi /* nospec si */ pushq %rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ pushq %rcx /* pt_regs->cx */ + xorl %ecx, %ecx /* nospec cx */ pushq $-ENOSYS /* pt_regs->ax */ pushq $0 /* pt_regs->r8 = 0 */ xorl %r8d, %r8d /* nospec r8 */ diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 95c294963612..aa3336a7cb15 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -7,14 +7,23 @@ #include <asm/asm-offsets.h> #include <asm/syscall.h> -#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#ifdef CONFIG_IA32_EMULATION +/* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); + +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); + +#else /* CONFIG_IA32_EMULATION */ +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_IA32_EMULATION */ + #include <asm/syscalls_32.h> #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, -extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); - __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index c176d2fab1da..d5252bc1e380 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -7,14 +7,14 @@ #include <asm/asm-offsets.h> #include <asm/syscall.h> -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); #include <asm/syscalls_64.h> #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, -extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); - asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c58f75b088c5..d6b27dab1b30 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -4,390 +4,395 @@ # The format is: # <number> <abi> <name> <entry point> <compat entry point> # +# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for +# sys_*() system calls and compat_sys_*() compat system calls if +# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only +# parameter. +# # The abi is always "i386" for this file. # -0 i386 restart_syscall sys_restart_syscall -1 i386 exit sys_exit -2 i386 fork sys_fork -3 i386 read sys_read -4 i386 write sys_write -5 i386 open sys_open compat_sys_open -6 i386 close sys_close -7 i386 waitpid sys_waitpid -8 i386 creat sys_creat -9 i386 link sys_link -10 i386 unlink sys_unlink -11 i386 execve sys_execve compat_sys_execve -12 i386 chdir sys_chdir -13 i386 time sys_time compat_sys_time -14 i386 mknod sys_mknod -15 i386 chmod sys_chmod -16 i386 lchown sys_lchown16 +0 i386 restart_syscall sys_restart_syscall __ia32_sys_restart_syscall +1 i386 exit sys_exit __ia32_sys_exit +2 i386 fork sys_fork __ia32_sys_fork +3 i386 read sys_read __ia32_sys_read +4 i386 write sys_write __ia32_sys_write +5 i386 open sys_open __ia32_compat_sys_open +6 i386 close sys_close __ia32_sys_close +7 i386 waitpid sys_waitpid __ia32_sys_waitpid +8 i386 creat sys_creat __ia32_sys_creat +9 i386 link sys_link __ia32_sys_link +10 i386 unlink sys_unlink __ia32_sys_unlink +11 i386 execve sys_execve __ia32_compat_sys_execve +12 i386 chdir sys_chdir __ia32_sys_chdir +13 i386 time sys_time __ia32_compat_sys_time +14 i386 mknod sys_mknod __ia32_sys_mknod +15 i386 chmod sys_chmod __ia32_sys_chmod +16 i386 lchown sys_lchown16 __ia32_sys_lchown16 17 i386 break -18 i386 oldstat sys_stat -19 i386 lseek sys_lseek compat_sys_lseek -20 i386 getpid sys_getpid -21 i386 mount sys_mount compat_sys_mount -22 i386 umount sys_oldumount -23 i386 setuid sys_setuid16 -24 i386 getuid sys_getuid16 -25 i386 stime sys_stime compat_sys_stime -26 i386 ptrace sys_ptrace compat_sys_ptrace -27 i386 alarm sys_alarm -28 i386 oldfstat sys_fstat -29 i386 pause sys_pause -30 i386 utime sys_utime compat_sys_utime +18 i386 oldstat sys_stat __ia32_sys_stat +19 i386 lseek sys_lseek __ia32_compat_sys_lseek +20 i386 getpid sys_getpid __ia32_sys_getpid +21 i386 mount sys_mount __ia32_compat_sys_mount +22 i386 umount sys_oldumount __ia32_sys_oldumount +23 i386 setuid sys_setuid16 __ia32_sys_setuid16 +24 i386 getuid sys_getuid16 __ia32_sys_getuid16 +25 i386 stime sys_stime __ia32_compat_sys_stime +26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace +27 i386 alarm sys_alarm __ia32_sys_alarm +28 i386 oldfstat sys_fstat __ia32_sys_fstat +29 i386 pause sys_pause __ia32_sys_pause +30 i386 utime sys_utime __ia32_compat_sys_utime 31 i386 stty 32 i386 gtty -33 i386 access sys_access -34 i386 nice sys_nice +33 i386 access sys_access __ia32_sys_access +34 i386 nice sys_nice __ia32_sys_nice 35 i386 ftime -36 i386 sync sys_sync -37 i386 kill sys_kill -38 i386 rename sys_rename -39 i386 mkdir sys_mkdir -40 i386 rmdir sys_rmdir -41 i386 dup sys_dup -42 i386 pipe sys_pipe -43 i386 times sys_times compat_sys_times +36 i386 sync sys_sync __ia32_sys_sync +37 i386 kill sys_kill __ia32_sys_kill +38 i386 rename sys_rename __ia32_sys_rename +39 i386 mkdir sys_mkdir __ia32_sys_mkdir +40 i386 rmdir sys_rmdir __ia32_sys_rmdir +41 i386 dup sys_dup __ia32_sys_dup +42 i386 pipe sys_pipe __ia32_sys_pipe +43 i386 times sys_times __ia32_compat_sys_times 44 i386 prof -45 i386 brk sys_brk -46 i386 setgid sys_setgid16 -47 i386 getgid sys_getgid16 -48 i386 signal sys_signal -49 i386 geteuid sys_geteuid16 -50 i386 getegid sys_getegid16 -51 i386 acct sys_acct -52 i386 umount2 sys_umount +45 i386 brk sys_brk __ia32_sys_brk +46 i386 setgid sys_setgid16 __ia32_sys_setgid16 +47 i386 getgid sys_getgid16 __ia32_sys_getgid16 +48 i386 signal sys_signal __ia32_sys_signal +49 i386 geteuid sys_geteuid16 __ia32_sys_geteuid16 +50 i386 getegid sys_getegid16 __ia32_sys_getegid16 +51 i386 acct sys_acct __ia32_sys_acct +52 i386 umount2 sys_umount __ia32_sys_umount 53 i386 lock -54 i386 ioctl sys_ioctl compat_sys_ioctl -55 i386 fcntl sys_fcntl compat_sys_fcntl64 +54 i386 ioctl sys_ioctl __ia32_compat_sys_ioctl +55 i386 fcntl sys_fcntl __ia32_compat_sys_fcntl64 56 i386 mpx -57 i386 setpgid sys_setpgid +57 i386 setpgid sys_setpgid __ia32_sys_setpgid 58 i386 ulimit -59 i386 oldolduname sys_olduname -60 i386 umask sys_umask -61 i386 chroot sys_chroot -62 i386 ustat sys_ustat compat_sys_ustat -63 i386 dup2 sys_dup2 -64 i386 getppid sys_getppid -65 i386 getpgrp sys_getpgrp -66 i386 setsid sys_setsid -67 i386 sigaction sys_sigaction compat_sys_sigaction -68 i386 sgetmask sys_sgetmask -69 i386 ssetmask sys_ssetmask -70 i386 setreuid sys_setreuid16 -71 i386 setregid sys_setregid16 -72 i386 sigsuspend sys_sigsuspend -73 i386 sigpending sys_sigpending compat_sys_sigpending -74 i386 sethostname sys_sethostname -75 i386 setrlimit sys_setrlimit compat_sys_setrlimit -76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit -77 i386 getrusage sys_getrusage compat_sys_getrusage -78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday -79 i386 settimeofday sys_settimeofday compat_sys_settimeofday -80 i386 getgroups sys_getgroups16 -81 i386 setgroups sys_setgroups16 -82 i386 select sys_old_select compat_sys_old_select -83 i386 symlink sys_symlink -84 i386 oldlstat sys_lstat -85 i386 readlink sys_readlink -86 i386 uselib sys_uselib -87 i386 swapon sys_swapon -88 i386 reboot sys_reboot -89 i386 readdir sys_old_readdir compat_sys_old_readdir -90 i386 mmap sys_old_mmap compat_sys_x86_mmap -91 i386 munmap sys_munmap -92 i386 truncate sys_truncate compat_sys_truncate -93 i386 ftruncate sys_ftruncate compat_sys_ftruncate -94 i386 fchmod sys_fchmod -95 i386 fchown sys_fchown16 -96 i386 getpriority sys_getpriority -97 i386 setpriority sys_setpriority +59 i386 oldolduname sys_olduname __ia32_sys_olduname +60 i386 umask sys_umask __ia32_sys_umask +61 i386 chroot sys_chroot __ia32_sys_chroot +62 i386 ustat sys_ustat __ia32_compat_sys_ustat +63 i386 dup2 sys_dup2 __ia32_sys_dup2 +64 i386 getppid sys_getppid __ia32_sys_getppid +65 i386 getpgrp sys_getpgrp __ia32_sys_getpgrp +66 i386 setsid sys_setsid __ia32_sys_setsid +67 i386 sigaction sys_sigaction __ia32_compat_sys_sigaction +68 i386 sgetmask sys_sgetmask __ia32_sys_sgetmask +69 i386 ssetmask sys_ssetmask __ia32_sys_ssetmask +70 i386 setreuid sys_setreuid16 __ia32_sys_setreuid16 +71 i386 setregid sys_setregid16 __ia32_sys_setregid16 +72 i386 sigsuspend sys_sigsuspend __ia32_sys_sigsuspend +73 i386 sigpending sys_sigpending __ia32_compat_sys_sigpending +74 i386 sethostname sys_sethostname __ia32_sys_sethostname +75 i386 setrlimit sys_setrlimit __ia32_compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit __ia32_compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage __ia32_compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday __ia32_compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday __ia32_compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 __ia32_sys_getgroups16 +81 i386 setgroups sys_setgroups16 __ia32_sys_setgroups16 +82 i386 select sys_old_select __ia32_compat_sys_old_select +83 i386 symlink sys_symlink __ia32_sys_symlink +84 i386 oldlstat sys_lstat __ia32_sys_lstat +85 i386 readlink sys_readlink __ia32_sys_readlink +86 i386 uselib sys_uselib __ia32_sys_uselib +87 i386 swapon sys_swapon __ia32_sys_swapon +88 i386 reboot sys_reboot __ia32_sys_reboot +89 i386 readdir sys_old_readdir __ia32_compat_sys_old_readdir +90 i386 mmap sys_old_mmap __ia32_compat_sys_x86_mmap +91 i386 munmap sys_munmap __ia32_sys_munmap +92 i386 truncate sys_truncate __ia32_compat_sys_truncate +93 i386 ftruncate sys_ftruncate __ia32_compat_sys_ftruncate +94 i386 fchmod sys_fchmod __ia32_sys_fchmod +95 i386 fchown sys_fchown16 __ia32_sys_fchown16 +96 i386 getpriority sys_getpriority __ia32_sys_getpriority +97 i386 setpriority sys_setpriority __ia32_sys_setpriority 98 i386 profil -99 i386 statfs sys_statfs compat_sys_statfs -100 i386 fstatfs sys_fstatfs compat_sys_fstatfs -101 i386 ioperm sys_ioperm -102 i386 socketcall sys_socketcall compat_sys_socketcall -103 i386 syslog sys_syslog -104 i386 setitimer sys_setitimer compat_sys_setitimer -105 i386 getitimer sys_getitimer compat_sys_getitimer -106 i386 stat sys_newstat compat_sys_newstat -107 i386 lstat sys_newlstat compat_sys_newlstat -108 i386 fstat sys_newfstat compat_sys_newfstat -109 i386 olduname sys_uname -110 i386 iopl sys_iopl -111 i386 vhangup sys_vhangup +99 i386 statfs sys_statfs __ia32_compat_sys_statfs +100 i386 fstatfs sys_fstatfs __ia32_compat_sys_fstatfs +101 i386 ioperm sys_ioperm __ia32_sys_ioperm +102 i386 socketcall sys_socketcall __ia32_compat_sys_socketcall +103 i386 syslog sys_syslog __ia32_sys_syslog +104 i386 setitimer sys_setitimer __ia32_compat_sys_setitimer +105 i386 getitimer sys_getitimer __ia32_compat_sys_getitimer +106 i386 stat sys_newstat __ia32_compat_sys_newstat +107 i386 lstat sys_newlstat __ia32_compat_sys_newlstat +108 i386 fstat sys_newfstat __ia32_compat_sys_newfstat +109 i386 olduname sys_uname __ia32_sys_uname +110 i386 iopl sys_iopl __ia32_sys_iopl +111 i386 vhangup sys_vhangup __ia32_sys_vhangup 112 i386 idle 113 i386 vm86old sys_vm86old sys_ni_syscall -114 i386 wait4 sys_wait4 compat_sys_wait4 -115 i386 swapoff sys_swapoff -116 i386 sysinfo sys_sysinfo compat_sys_sysinfo -117 i386 ipc sys_ipc compat_sys_ipc -118 i386 fsync sys_fsync +114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 +115 i386 swapoff sys_swapoff __ia32_sys_swapoff +116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo +117 i386 ipc sys_ipc __ia32_compat_sys_ipc +118 i386 fsync sys_fsync __ia32_sys_fsync 119 i386 sigreturn sys_sigreturn sys32_sigreturn -120 i386 clone sys_clone compat_sys_x86_clone -121 i386 setdomainname sys_setdomainname -122 i386 uname sys_newuname -123 i386 modify_ldt sys_modify_ldt -124 i386 adjtimex sys_adjtimex compat_sys_adjtimex -125 i386 mprotect sys_mprotect -126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +120 i386 clone sys_clone __ia32_compat_sys_x86_clone +121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname +122 i386 uname sys_newuname __ia32_sys_newuname +123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt +124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex +125 i386 mprotect sys_mprotect __ia32_sys_mprotect +126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask 127 i386 create_module -128 i386 init_module sys_init_module -129 i386 delete_module sys_delete_module +128 i386 init_module sys_init_module __ia32_sys_init_module +129 i386 delete_module sys_delete_module __ia32_sys_delete_module 130 i386 get_kernel_syms -131 i386 quotactl sys_quotactl compat_sys_quotactl32 -132 i386 getpgid sys_getpgid -133 i386 fchdir sys_fchdir -134 i386 bdflush sys_bdflush -135 i386 sysfs sys_sysfs -136 i386 personality sys_personality +131 i386 quotactl sys_quotactl __ia32_compat_sys_quotactl32 +132 i386 getpgid sys_getpgid __ia32_sys_getpgid +133 i386 fchdir sys_fchdir __ia32_sys_fchdir +134 i386 bdflush sys_bdflush __ia32_sys_bdflush +135 i386 sysfs sys_sysfs __ia32_sys_sysfs +136 i386 personality sys_personality __ia32_sys_personality 137 i386 afs_syscall -138 i386 setfsuid sys_setfsuid16 -139 i386 setfsgid sys_setfsgid16 -140 i386 _llseek sys_llseek -141 i386 getdents sys_getdents compat_sys_getdents -142 i386 _newselect sys_select compat_sys_select -143 i386 flock sys_flock -144 i386 msync sys_msync -145 i386 readv sys_readv compat_sys_readv -146 i386 writev sys_writev compat_sys_writev -147 i386 getsid sys_getsid -148 i386 fdatasync sys_fdatasync -149 i386 _sysctl sys_sysctl compat_sys_sysctl -150 i386 mlock sys_mlock -151 i386 munlock sys_munlock -152 i386 mlockall sys_mlockall -153 i386 munlockall sys_munlockall -154 i386 sched_setparam sys_sched_setparam -155 i386 sched_getparam sys_sched_getparam -156 i386 sched_setscheduler sys_sched_setscheduler -157 i386 sched_getscheduler sys_sched_getscheduler -158 i386 sched_yield sys_sched_yield -159 i386 sched_get_priority_max sys_sched_get_priority_max -160 i386 sched_get_priority_min sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 i386 nanosleep sys_nanosleep compat_sys_nanosleep -163 i386 mremap sys_mremap -164 i386 setresuid sys_setresuid16 -165 i386 getresuid sys_getresuid16 +138 i386 setfsuid sys_setfsuid16 __ia32_sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 __ia32_sys_setfsgid16 +140 i386 _llseek sys_llseek __ia32_sys_llseek +141 i386 getdents sys_getdents __ia32_compat_sys_getdents +142 i386 _newselect sys_select __ia32_compat_sys_select +143 i386 flock sys_flock __ia32_sys_flock +144 i386 msync sys_msync __ia32_sys_msync +145 i386 readv sys_readv __ia32_compat_sys_readv +146 i386 writev sys_writev __ia32_compat_sys_writev +147 i386 getsid sys_getsid __ia32_sys_getsid +148 i386 fdatasync sys_fdatasync __ia32_sys_fdatasync +149 i386 _sysctl sys_sysctl __ia32_compat_sys_sysctl +150 i386 mlock sys_mlock __ia32_sys_mlock +151 i386 munlock sys_munlock __ia32_sys_munlock +152 i386 mlockall sys_mlockall __ia32_sys_mlockall +153 i386 munlockall sys_munlockall __ia32_sys_munlockall +154 i386 sched_setparam sys_sched_setparam __ia32_sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam __ia32_sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler __ia32_sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler __ia32_sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep +163 i386 mremap sys_mremap __ia32_sys_mremap +164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 +165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 166 i386 vm86 sys_vm86 sys_ni_syscall 167 i386 query_module -168 i386 poll sys_poll +168 i386 poll sys_poll __ia32_sys_poll 169 i386 nfsservctl -170 i386 setresgid sys_setresgid16 -171 i386 getresgid sys_getresgid16 -172 i386 prctl sys_prctl +170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16 +171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 +172 i386 prctl sys_prctl __ia32_sys_prctl 173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn -174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask -176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait -178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend -180 i386 pread64 sys_pread64 compat_sys_x86_pread -181 i386 pwrite64 sys_pwrite64 compat_sys_x86_pwrite -182 i386 chown sys_chown16 -183 i386 getcwd sys_getcwd -184 i386 capget sys_capget -185 i386 capset sys_capset -186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 i386 sendfile sys_sendfile compat_sys_sendfile +174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread +181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite +182 i386 chown sys_chown16 __ia32_sys_chown16 +183 i386 getcwd sys_getcwd __ia32_sys_getcwd +184 i386 capget sys_capget __ia32_sys_capget +185 i386 capset sys_capset __ia32_sys_capset +186 i386 sigaltstack sys_sigaltstack __ia32_compat_sys_sigaltstack +187 i386 sendfile sys_sendfile __ia32_compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork -191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit -192 i386 mmap2 sys_mmap_pgoff -193 i386 truncate64 sys_truncate64 compat_sys_x86_truncate64 -194 i386 ftruncate64 sys_ftruncate64 compat_sys_x86_ftruncate64 -195 i386 stat64 sys_stat64 compat_sys_x86_stat64 -196 i386 lstat64 sys_lstat64 compat_sys_x86_lstat64 -197 i386 fstat64 sys_fstat64 compat_sys_x86_fstat64 -198 i386 lchown32 sys_lchown -199 i386 getuid32 sys_getuid -200 i386 getgid32 sys_getgid -201 i386 geteuid32 sys_geteuid -202 i386 getegid32 sys_getegid -203 i386 setreuid32 sys_setreuid -204 i386 setregid32 sys_setregid -205 i386 getgroups32 sys_getgroups -206 i386 setgroups32 sys_setgroups -207 i386 fchown32 sys_fchown -208 i386 setresuid32 sys_setresuid -209 i386 getresuid32 sys_getresuid -210 i386 setresgid32 sys_setresgid -211 i386 getresgid32 sys_getresgid -212 i386 chown32 sys_chown -213 i386 setuid32 sys_setuid -214 i386 setgid32 sys_setgid -215 i386 setfsuid32 sys_setfsuid -216 i386 setfsgid32 sys_setfsgid -217 i386 pivot_root sys_pivot_root -218 i386 mincore sys_mincore -219 i386 madvise sys_madvise -220 i386 getdents64 sys_getdents64 -221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +190 i386 vfork sys_vfork __ia32_sys_vfork +191 i386 ugetrlimit sys_getrlimit __ia32_compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff __ia32_sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 __ia32_compat_sys_x86_truncate64 +194 i386 ftruncate64 sys_ftruncate64 __ia32_compat_sys_x86_ftruncate64 +195 i386 stat64 sys_stat64 __ia32_compat_sys_x86_stat64 +196 i386 lstat64 sys_lstat64 __ia32_compat_sys_x86_lstat64 +197 i386 fstat64 sys_fstat64 __ia32_compat_sys_x86_fstat64 +198 i386 lchown32 sys_lchown __ia32_sys_lchown +199 i386 getuid32 sys_getuid __ia32_sys_getuid +200 i386 getgid32 sys_getgid __ia32_sys_getgid +201 i386 geteuid32 sys_geteuid __ia32_sys_geteuid +202 i386 getegid32 sys_getegid __ia32_sys_getegid +203 i386 setreuid32 sys_setreuid __ia32_sys_setreuid +204 i386 setregid32 sys_setregid __ia32_sys_setregid +205 i386 getgroups32 sys_getgroups __ia32_sys_getgroups +206 i386 setgroups32 sys_setgroups __ia32_sys_setgroups +207 i386 fchown32 sys_fchown __ia32_sys_fchown +208 i386 setresuid32 sys_setresuid __ia32_sys_setresuid +209 i386 getresuid32 sys_getresuid __ia32_sys_getresuid +210 i386 setresgid32 sys_setresgid __ia32_sys_setresgid +211 i386 getresgid32 sys_getresgid __ia32_sys_getresgid +212 i386 chown32 sys_chown __ia32_sys_chown +213 i386 setuid32 sys_setuid __ia32_sys_setuid +214 i386 setgid32 sys_setgid __ia32_sys_setgid +215 i386 setfsuid32 sys_setfsuid __ia32_sys_setfsuid +216 i386 setfsgid32 sys_setfsgid __ia32_sys_setfsgid +217 i386 pivot_root sys_pivot_root __ia32_sys_pivot_root +218 i386 mincore sys_mincore __ia32_sys_mincore +219 i386 madvise sys_madvise __ia32_sys_madvise +220 i386 getdents64 sys_getdents64 __ia32_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 __ia32_compat_sys_fcntl64 # 222 is unused # 223 is unused -224 i386 gettid sys_gettid -225 i386 readahead sys_readahead compat_sys_x86_readahead -226 i386 setxattr sys_setxattr -227 i386 lsetxattr sys_lsetxattr -228 i386 fsetxattr sys_fsetxattr -229 i386 getxattr sys_getxattr -230 i386 lgetxattr sys_lgetxattr -231 i386 fgetxattr sys_fgetxattr -232 i386 listxattr sys_listxattr -233 i386 llistxattr sys_llistxattr -234 i386 flistxattr sys_flistxattr -235 i386 removexattr sys_removexattr -236 i386 lremovexattr sys_lremovexattr -237 i386 fremovexattr sys_fremovexattr -238 i386 tkill sys_tkill -239 i386 sendfile64 sys_sendfile64 -240 i386 futex sys_futex compat_sys_futex -241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity -243 i386 set_thread_area sys_set_thread_area -244 i386 get_thread_area sys_get_thread_area -245 i386 io_setup sys_io_setup compat_sys_io_setup -246 i386 io_destroy sys_io_destroy -247 i386 io_getevents sys_io_getevents compat_sys_io_getevents -248 i386 io_submit sys_io_submit compat_sys_io_submit -249 i386 io_cancel sys_io_cancel -250 i386 fadvise64 sys_fadvise64 compat_sys_x86_fadvise64 +224 i386 gettid sys_gettid __ia32_sys_gettid +225 i386 readahead sys_readahead __ia32_compat_sys_x86_readahead +226 i386 setxattr sys_setxattr __ia32_sys_setxattr +227 i386 lsetxattr sys_lsetxattr __ia32_sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr __ia32_sys_fsetxattr +229 i386 getxattr sys_getxattr __ia32_sys_getxattr +230 i386 lgetxattr sys_lgetxattr __ia32_sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr __ia32_sys_fgetxattr +232 i386 listxattr sys_listxattr __ia32_sys_listxattr +233 i386 llistxattr sys_llistxattr __ia32_sys_llistxattr +234 i386 flistxattr sys_flistxattr __ia32_sys_flistxattr +235 i386 removexattr sys_removexattr __ia32_sys_removexattr +236 i386 lremovexattr sys_lremovexattr __ia32_sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr +238 i386 tkill sys_tkill __ia32_sys_tkill +239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64 +240 i386 futex sys_futex __ia32_compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area +245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup +246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy +247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents +248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit +249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel +250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) -252 i386 exit_group sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie -254 i386 epoll_create sys_epoll_create -255 i386 epoll_ctl sys_epoll_ctl -256 i386 epoll_wait sys_epoll_wait -257 i386 remap_file_pages sys_remap_file_pages -258 i386 set_tid_address sys_set_tid_address -259 i386 timer_create sys_timer_create compat_sys_timer_create -260 i386 timer_settime sys_timer_settime compat_sys_timer_settime -261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime -262 i386 timer_getoverrun sys_timer_getoverrun -263 i386 timer_delete sys_timer_delete -264 i386 clock_settime sys_clock_settime compat_sys_clock_settime -265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime -266 i386 clock_getres sys_clock_getres compat_sys_clock_getres -267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep -268 i386 statfs64 sys_statfs64 compat_sys_statfs64 -269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -270 i386 tgkill sys_tgkill -271 i386 utimes sys_utimes compat_sys_utimes -272 i386 fadvise64_64 sys_fadvise64_64 compat_sys_x86_fadvise64_64 +252 i386 exit_group sys_exit_group __ia32_sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie __ia32_compat_sys_lookup_dcookie +254 i386 epoll_create sys_epoll_create __ia32_sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl __ia32_sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait __ia32_sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address +259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create +260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete +264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill __ia32_sys_tgkill +271 i386 utimes sys_utimes __ia32_compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64 273 i386 vserver -274 i386 mbind sys_mbind -275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -276 i386 set_mempolicy sys_set_mempolicy -277 i386 mq_open sys_mq_open compat_sys_mq_open -278 i386 mq_unlink sys_mq_unlink -279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive -281 i386 mq_notify sys_mq_notify compat_sys_mq_notify -282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr -283 i386 kexec_load sys_kexec_load compat_sys_kexec_load -284 i386 waitid sys_waitid compat_sys_waitid +274 i386 mbind sys_mbind __ia32_sys_mbind +275 i386 get_mempolicy sys_get_mempolicy __ia32_compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy +277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load +284 i386 waitid sys_waitid __ia32_compat_sys_waitid # 285 sys_setaltroot -286 i386 add_key sys_add_key -287 i386 request_key sys_request_key -288 i386 keyctl sys_keyctl compat_sys_keyctl -289 i386 ioprio_set sys_ioprio_set -290 i386 ioprio_get sys_ioprio_get -291 i386 inotify_init sys_inotify_init -292 i386 inotify_add_watch sys_inotify_add_watch -293 i386 inotify_rm_watch sys_inotify_rm_watch -294 i386 migrate_pages sys_migrate_pages -295 i386 openat sys_openat compat_sys_openat -296 i386 mkdirat sys_mkdirat -297 i386 mknodat sys_mknodat -298 i386 fchownat sys_fchownat -299 i386 futimesat sys_futimesat compat_sys_futimesat -300 i386 fstatat64 sys_fstatat64 compat_sys_x86_fstatat -301 i386 unlinkat sys_unlinkat -302 i386 renameat sys_renameat -303 i386 linkat sys_linkat -304 i386 symlinkat sys_symlinkat -305 i386 readlinkat sys_readlinkat -306 i386 fchmodat sys_fchmodat -307 i386 faccessat sys_faccessat -308 i386 pselect6 sys_pselect6 compat_sys_pselect6 -309 i386 ppoll sys_ppoll compat_sys_ppoll -310 i386 unshare sys_unshare -311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list -312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list -313 i386 splice sys_splice -314 i386 sync_file_range sys_sync_file_range compat_sys_x86_sync_file_range -315 i386 tee sys_tee -316 i386 vmsplice sys_vmsplice compat_sys_vmsplice -317 i386 move_pages sys_move_pages compat_sys_move_pages -318 i386 getcpu sys_getcpu -319 i386 epoll_pwait sys_epoll_pwait -320 i386 utimensat sys_utimensat compat_sys_utimensat -321 i386 signalfd sys_signalfd compat_sys_signalfd -322 i386 timerfd_create sys_timerfd_create -323 i386 eventfd sys_eventfd -324 i386 fallocate sys_fallocate compat_sys_x86_fallocate -325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime -327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 -328 i386 eventfd2 sys_eventfd2 -329 i386 epoll_create1 sys_epoll_create1 -330 i386 dup3 sys_dup3 -331 i386 pipe2 sys_pipe2 -332 i386 inotify_init1 sys_inotify_init1 -333 i386 preadv sys_preadv compat_sys_preadv -334 i386 pwritev sys_pwritev compat_sys_pwritev -335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -336 i386 perf_event_open sys_perf_event_open -337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg -338 i386 fanotify_init sys_fanotify_init -339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -340 i386 prlimit64 sys_prlimit64 -341 i386 name_to_handle_at sys_name_to_handle_at -342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime -344 i386 syncfs sys_syncfs -345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg -346 i386 setns sys_setns -347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv -348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev -349 i386 kcmp sys_kcmp -350 i386 finit_module sys_finit_module -351 i386 sched_setattr sys_sched_setattr -352 i386 sched_getattr sys_sched_getattr -353 i386 renameat2 sys_renameat2 -354 i386 seccomp sys_seccomp -355 i386 getrandom sys_getrandom -356 i386 memfd_create sys_memfd_create -357 i386 bpf sys_bpf -358 i386 execveat sys_execveat compat_sys_execveat -359 i386 socket sys_socket -360 i386 socketpair sys_socketpair -361 i386 bind sys_bind -362 i386 connect sys_connect -363 i386 listen sys_listen -364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt -367 i386 getsockname sys_getsockname -368 i386 getpeername sys_getpeername -369 i386 sendto sys_sendto -370 i386 sendmsg sys_sendmsg compat_sys_sendmsg -371 i386 recvfrom sys_recvfrom compat_sys_recvfrom -372 i386 recvmsg sys_recvmsg compat_sys_recvmsg -373 i386 shutdown sys_shutdown -374 i386 userfaultfd sys_userfaultfd -375 i386 membarrier sys_membarrier -376 i386 mlock2 sys_mlock2 -377 i386 copy_file_range sys_copy_file_range -378 i386 preadv2 sys_preadv2 compat_sys_preadv2 -379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 -380 i386 pkey_mprotect sys_pkey_mprotect -381 i386 pkey_alloc sys_pkey_alloc -382 i386 pkey_free sys_pkey_free -383 i386 statx sys_statx -384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +286 i386 add_key sys_add_key __ia32_sys_add_key +287 i386 request_key sys_request_key __ia32_sys_request_key +288 i386 keyctl sys_keyctl __ia32_compat_sys_keyctl +289 i386 ioprio_set sys_ioprio_set __ia32_sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get __ia32_sys_ioprio_get +291 i386 inotify_init sys_inotify_init __ia32_sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch __ia32_sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch __ia32_sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages __ia32_sys_migrate_pages +295 i386 openat sys_openat __ia32_compat_sys_openat +296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat +297 i386 mknodat sys_mknodat __ia32_sys_mknodat +298 i386 fchownat sys_fchownat __ia32_sys_fchownat +299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat +301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat +302 i386 renameat sys_renameat __ia32_sys_renameat +303 i386 linkat sys_linkat __ia32_sys_linkat +304 i386 symlinkat sys_symlinkat __ia32_sys_symlinkat +305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat +306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat +307 i386 faccessat sys_faccessat __ia32_sys_faccessat +308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6 +309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll +310 i386 unshare sys_unshare __ia32_sys_unshare +311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list +313 i386 splice sys_splice __ia32_sys_splice +314 i386 sync_file_range sys_sync_file_range __ia32_compat_sys_x86_sync_file_range +315 i386 tee sys_tee __ia32_sys_tee +316 i386 vmsplice sys_vmsplice __ia32_compat_sys_vmsplice +317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages +318 i386 getcpu sys_getcpu __ia32_sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait +320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat +321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create +323 i386 eventfd sys_eventfd __ia32_sys_eventfd +324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate +325 i386 timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1 +330 i386 dup3 sys_dup3 __ia32_sys_dup3 +331 i386 pipe2 sys_pipe2 __ia32_sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 __ia32_sys_inotify_init1 +333 i386 preadv sys_preadv __ia32_compat_sys_preadv +334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs __ia32_sys_syncfs +345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg +346 i386 setns sys_setns __ia32_sys_setns +347 i386 process_vm_readv sys_process_vm_readv __ia32_compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev __ia32_compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp __ia32_sys_kcmp +350 i386 finit_module sys_finit_module __ia32_sys_finit_module +351 i386 sched_setattr sys_sched_setattr __ia32_sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr __ia32_sys_sched_getattr +353 i386 renameat2 sys_renameat2 __ia32_sys_renameat2 +354 i386 seccomp sys_seccomp __ia32_sys_seccomp +355 i386 getrandom sys_getrandom __ia32_sys_getrandom +356 i386 memfd_create sys_memfd_create __ia32_sys_memfd_create +357 i386 bpf sys_bpf __ia32_sys_bpf +358 i386 execveat sys_execveat __ia32_compat_sys_execveat +359 i386 socket sys_socket __ia32_sys_socket +360 i386 socketpair sys_socketpair __ia32_sys_socketpair +361 i386 bind sys_bind __ia32_sys_bind +362 i386 connect sys_connect __ia32_sys_connect +363 i386 listen sys_listen __ia32_sys_listen +364 i386 accept4 sys_accept4 __ia32_sys_accept4 +365 i386 getsockopt sys_getsockopt __ia32_compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt __ia32_compat_sys_setsockopt +367 i386 getsockname sys_getsockname __ia32_sys_getsockname +368 i386 getpeername sys_getpeername __ia32_sys_getpeername +369 i386 sendto sys_sendto __ia32_sys_sendto +370 i386 sendmsg sys_sendmsg __ia32_compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom __ia32_compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg __ia32_compat_sys_recvmsg +373 i386 shutdown sys_shutdown __ia32_sys_shutdown +374 i386 userfaultfd sys_userfaultfd __ia32_sys_userfaultfd +375 i386 membarrier sys_membarrier __ia32_sys_membarrier +376 i386 mlock2 sys_mlock2 __ia32_sys_mlock2 +377 i386 copy_file_range sys_copy_file_range __ia32_sys_copy_file_range +378 i386 preadv2 sys_preadv2 __ia32_compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 __ia32_compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect __ia32_sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc __ia32_sys_pkey_alloc +382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free +383 i386 statx sys_statx __ia32_sys_statx +384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183e2f85..4dfe42666d0c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -4,379 +4,383 @@ # The format is: # <number> <abi> <name> <entry point> # +# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls +# # The abi is "common", "64" or "x32" for this file. # -0 common read sys_read -1 common write sys_write -2 common open sys_open -3 common close sys_close -4 common stat sys_newstat -5 common fstat sys_newfstat -6 common lstat sys_newlstat -7 common poll sys_poll -8 common lseek sys_lseek -9 common mmap sys_mmap -10 common mprotect sys_mprotect -11 common munmap sys_munmap -12 common brk sys_brk -13 64 rt_sigaction sys_rt_sigaction -14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn sys_rt_sigreturn/ptregs -16 64 ioctl sys_ioctl -17 common pread64 sys_pread64 -18 common pwrite64 sys_pwrite64 -19 64 readv sys_readv -20 64 writev sys_writev -21 common access sys_access -22 common pipe sys_pipe -23 common select sys_select -24 common sched_yield sys_sched_yield -25 common mremap sys_mremap -26 common msync sys_msync -27 common mincore sys_mincore -28 common madvise sys_madvise -29 common shmget sys_shmget -30 common shmat sys_shmat -31 common shmctl sys_shmctl -32 common dup sys_dup -33 common dup2 sys_dup2 -34 common pause sys_pause -35 common nanosleep sys_nanosleep -36 common getitimer sys_getitimer -37 common alarm sys_alarm -38 common setitimer sys_setitimer -39 common getpid sys_getpid -40 common sendfile sys_sendfile64 -41 common socket sys_socket -42 common connect sys_connect -43 common accept sys_accept -44 common sendto sys_sendto -45 64 recvfrom sys_recvfrom -46 64 sendmsg sys_sendmsg -47 64 recvmsg sys_recvmsg -48 common shutdown sys_shutdown -49 common bind sys_bind -50 common listen sys_listen -51 common getsockname sys_getsockname -52 common getpeername sys_getpeername -53 common socketpair sys_socketpair -54 64 setsockopt sys_setsockopt -55 64 getsockopt sys_getsockopt -56 common clone sys_clone/ptregs -57 common fork sys_fork/ptregs -58 common vfork sys_vfork/ptregs -59 64 execve sys_execve/ptregs -60 common exit sys_exit -61 common wait4 sys_wait4 -62 common kill sys_kill -63 common uname sys_newuname -64 common semget sys_semget -65 common semop sys_semop -66 common semctl sys_semctl -67 common shmdt sys_shmdt -68 common msgget sys_msgget -69 common msgsnd sys_msgsnd -70 common msgrcv sys_msgrcv -71 common msgctl sys_msgctl -72 common fcntl sys_fcntl -73 common flock sys_flock -74 common fsync sys_fsync -75 common fdatasync sys_fdatasync -76 common truncate sys_truncate -77 common ftruncate sys_ftruncate -78 common getdents sys_getdents -79 common getcwd sys_getcwd -80 common chdir sys_chdir -81 common fchdir sys_fchdir -82 common rename sys_rename -83 common mkdir sys_mkdir -84 common rmdir sys_rmdir -85 common creat sys_creat -86 common link sys_link -87 common unlink sys_unlink -88 common symlink sys_symlink -89 common readlink sys_readlink -90 common chmod sys_chmod -91 common fchmod sys_fchmod -92 common chown sys_chown -93 common fchown sys_fchown -94 common lchown sys_lchown -95 common umask sys_umask -96 common gettimeofday sys_gettimeofday -97 common getrlimit sys_getrlimit -98 common getrusage sys_getrusage -99 common sysinfo sys_sysinfo -100 common times sys_times -101 64 ptrace sys_ptrace -102 common getuid sys_getuid -103 common syslog sys_syslog -104 common getgid sys_getgid -105 common setuid sys_setuid -106 common setgid sys_setgid -107 common geteuid sys_geteuid -108 common getegid sys_getegid -109 common setpgid sys_setpgid -110 common getppid sys_getppid -111 common getpgrp sys_getpgrp -112 common setsid sys_setsid -113 common setreuid sys_setreuid -114 common setregid sys_setregid -115 common getgroups sys_getgroups -116 common setgroups sys_setgroups -117 common setresuid sys_setresuid -118 common getresuid sys_getresuid -119 common setresgid sys_setresgid -120 common getresgid sys_getresgid -121 common getpgid sys_getpgid -122 common setfsuid sys_setfsuid -123 common setfsgid sys_setfsgid -124 common getsid sys_getsid -125 common capget sys_capget -126 common capset sys_capset -127 64 rt_sigpending sys_rt_sigpending -128 64 rt_sigtimedwait sys_rt_sigtimedwait -129 64 rt_sigqueueinfo sys_rt_sigqueueinfo -130 common rt_sigsuspend sys_rt_sigsuspend -131 64 sigaltstack sys_sigaltstack -132 common utime sys_utime -133 common mknod sys_mknod +0 common read __x64_sys_read +1 common write __x64_sys_write +2 common open __x64_sys_open +3 common close __x64_sys_close +4 common stat __x64_sys_newstat +5 common fstat __x64_sys_newfstat +6 common lstat __x64_sys_newlstat +7 common poll __x64_sys_poll +8 common lseek __x64_sys_lseek +9 common mmap __x64_sys_mmap +10 common mprotect __x64_sys_mprotect +11 common munmap __x64_sys_munmap +12 common brk __x64_sys_brk +13 64 rt_sigaction __x64_sys_rt_sigaction +14 common rt_sigprocmask __x64_sys_rt_sigprocmask +15 64 rt_sigreturn __x64_sys_rt_sigreturn/ptregs +16 64 ioctl __x64_sys_ioctl +17 common pread64 __x64_sys_pread64 +18 common pwrite64 __x64_sys_pwrite64 +19 64 readv __x64_sys_readv +20 64 writev __x64_sys_writev +21 common access __x64_sys_access +22 common pipe __x64_sys_pipe +23 common select __x64_sys_select +24 common sched_yield __x64_sys_sched_yield +25 common mremap __x64_sys_mremap +26 common msync __x64_sys_msync +27 common mincore __x64_sys_mincore +28 common madvise __x64_sys_madvise +29 common shmget __x64_sys_shmget +30 common shmat __x64_sys_shmat +31 common shmctl __x64_sys_shmctl +32 common dup __x64_sys_dup +33 common dup2 __x64_sys_dup2 +34 common pause __x64_sys_pause +35 common nanosleep __x64_sys_nanosleep +36 common getitimer __x64_sys_getitimer +37 common alarm __x64_sys_alarm +38 common setitimer __x64_sys_setitimer +39 common getpid __x64_sys_getpid +40 common sendfile __x64_sys_sendfile64 +41 common socket __x64_sys_socket +42 common connect __x64_sys_connect +43 common accept __x64_sys_accept +44 common sendto __x64_sys_sendto +45 64 recvfrom __x64_sys_recvfrom +46 64 sendmsg __x64_sys_sendmsg +47 64 recvmsg __x64_sys_recvmsg +48 common shutdown __x64_sys_shutdown +49 common bind __x64_sys_bind +50 common listen __x64_sys_listen +51 common getsockname __x64_sys_getsockname +52 common getpeername __x64_sys_getpeername +53 common socketpair __x64_sys_socketpair +54 64 setsockopt __x64_sys_setsockopt +55 64 getsockopt __x64_sys_getsockopt +56 common clone __x64_sys_clone/ptregs +57 common fork __x64_sys_fork/ptregs +58 common vfork __x64_sys_vfork/ptregs +59 64 execve __x64_sys_execve/ptregs +60 common exit __x64_sys_exit +61 common wait4 __x64_sys_wait4 +62 common kill __x64_sys_kill +63 common uname __x64_sys_newuname +64 common semget __x64_sys_semget +65 common semop __x64_sys_semop +66 common semctl __x64_sys_semctl +67 common shmdt __x64_sys_shmdt +68 common msgget __x64_sys_msgget +69 common msgsnd __x64_sys_msgsnd +70 common msgrcv __x64_sys_msgrcv +71 common msgctl __x64_sys_msgctl +72 common fcntl __x64_sys_fcntl +73 common flock __x64_sys_flock +74 common fsync __x64_sys_fsync +75 common fdatasync __x64_sys_fdatasync +76 common truncate __x64_sys_truncate +77 common ftruncate __x64_sys_ftruncate +78 common getdents __x64_sys_getdents +79 common getcwd __x64_sys_getcwd +80 common chdir __x64_sys_chdir +81 common fchdir __x64_sys_fchdir +82 common rename __x64_sys_rename +83 common mkdir __x64_sys_mkdir +84 common rmdir __x64_sys_rmdir +85 common creat __x64_sys_creat +86 common link __x64_sys_link +87 common unlink __x64_sys_unlink +88 common symlink __x64_sys_symlink +89 common readlink __x64_sys_readlink +90 common chmod __x64_sys_chmod +91 common fchmod __x64_sys_fchmod +92 common chown __x64_sys_chown +93 common fchown __x64_sys_fchown +94 common lchown __x64_sys_lchown +95 common umask __x64_sys_umask +96 common gettimeofday __x64_sys_gettimeofday +97 common getrlimit __x64_sys_getrlimit +98 common getrusage __x64_sys_getrusage +99 common sysinfo __x64_sys_sysinfo +100 common times __x64_sys_times +101 64 ptrace __x64_sys_ptrace +102 common getuid __x64_sys_getuid +103 common syslog __x64_sys_syslog +104 common getgid __x64_sys_getgid +105 common setuid __x64_sys_setuid +106 common setgid __x64_sys_setgid +107 common geteuid __x64_sys_geteuid +108 common getegid __x64_sys_getegid +109 common setpgid __x64_sys_setpgid +110 common getppid __x64_sys_getppid +111 common getpgrp __x64_sys_getpgrp +112 common setsid __x64_sys_setsid +113 common setreuid __x64_sys_setreuid +114 common setregid __x64_sys_setregid +115 common getgroups __x64_sys_getgroups +116 common setgroups __x64_sys_setgroups +117 common setresuid __x64_sys_setresuid +118 common getresuid __x64_sys_getresuid +119 common setresgid __x64_sys_setresgid +120 common getresgid __x64_sys_getresgid +121 common getpgid __x64_sys_getpgid +122 common setfsuid __x64_sys_setfsuid +123 common setfsgid __x64_sys_setfsgid +124 common getsid __x64_sys_getsid +125 common capget __x64_sys_capget +126 common capset __x64_sys_capset +127 64 rt_sigpending __x64_sys_rt_sigpending +128 64 rt_sigtimedwait __x64_sys_rt_sigtimedwait +129 64 rt_sigqueueinfo __x64_sys_rt_sigqueueinfo +130 common rt_sigsuspend __x64_sys_rt_sigsuspend +131 64 sigaltstack __x64_sys_sigaltstack +132 common utime __x64_sys_utime +133 common mknod __x64_sys_mknod 134 64 uselib -135 common personality sys_personality -136 common ustat sys_ustat -137 common statfs sys_statfs -138 common fstatfs sys_fstatfs -139 common sysfs sys_sysfs -140 common getpriority sys_getpriority -141 common setpriority sys_setpriority -142 common sched_setparam sys_sched_setparam -143 common sched_getparam sys_sched_getparam -144 common sched_setscheduler sys_sched_setscheduler -145 common sched_getscheduler sys_sched_getscheduler -146 common sched_get_priority_max sys_sched_get_priority_max -147 common sched_get_priority_min sys_sched_get_priority_min -148 common sched_rr_get_interval sys_sched_rr_get_interval -149 common mlock sys_mlock -150 common munlock sys_munlock -151 common mlockall sys_mlockall -152 common munlockall sys_munlockall -153 common vhangup sys_vhangup -154 common modify_ldt sys_modify_ldt -155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl -157 common prctl sys_prctl -158 common arch_prctl sys_arch_prctl -159 common adjtimex sys_adjtimex -160 common setrlimit sys_setrlimit -161 common chroot sys_chroot -162 common sync sys_sync -163 common acct sys_acct -164 common settimeofday sys_settimeofday -165 common mount sys_mount -166 common umount2 sys_umount -167 common swapon sys_swapon -168 common swapoff sys_swapoff -169 common reboot sys_reboot -170 common sethostname sys_sethostname -171 common setdomainname sys_setdomainname -172 common iopl sys_iopl/ptregs -173 common ioperm sys_ioperm +135 common personality __x64_sys_personality +136 common ustat __x64_sys_ustat +137 common statfs __x64_sys_statfs +138 common fstatfs __x64_sys_fstatfs +139 common sysfs __x64_sys_sysfs +140 common getpriority __x64_sys_getpriority +141 common setpriority __x64_sys_setpriority +142 common sched_setparam __x64_sys_sched_setparam +143 common sched_getparam __x64_sys_sched_getparam +144 common sched_setscheduler __x64_sys_sched_setscheduler +145 common sched_getscheduler __x64_sys_sched_getscheduler +146 common sched_get_priority_max __x64_sys_sched_get_priority_max +147 common sched_get_priority_min __x64_sys_sched_get_priority_min +148 common sched_rr_get_interval __x64_sys_sched_rr_get_interval +149 common mlock __x64_sys_mlock +150 common munlock __x64_sys_munlock +151 common mlockall __x64_sys_mlockall +152 common munlockall __x64_sys_munlockall +153 common vhangup __x64_sys_vhangup +154 common modify_ldt __x64_sys_modify_ldt +155 common pivot_root __x64_sys_pivot_root +156 64 _sysctl __x64_sys_sysctl +157 common prctl __x64_sys_prctl +158 common arch_prctl __x64_sys_arch_prctl +159 common adjtimex __x64_sys_adjtimex +160 common setrlimit __x64_sys_setrlimit +161 common chroot __x64_sys_chroot +162 common sync __x64_sys_sync +163 common acct __x64_sys_acct +164 common settimeofday __x64_sys_settimeofday +165 common mount __x64_sys_mount +166 common umount2 __x64_sys_umount +167 common swapon __x64_sys_swapon +168 common swapoff __x64_sys_swapoff +169 common reboot __x64_sys_reboot +170 common sethostname __x64_sys_sethostname +171 common setdomainname __x64_sys_setdomainname +172 common iopl __x64_sys_iopl/ptregs +173 common ioperm __x64_sys_ioperm 174 64 create_module -175 common init_module sys_init_module -176 common delete_module sys_delete_module +175 common init_module __x64_sys_init_module +176 common delete_module __x64_sys_delete_module 177 64 get_kernel_syms 178 64 query_module -179 common quotactl sys_quotactl +179 common quotactl __x64_sys_quotactl 180 64 nfsservctl 181 common getpmsg 182 common putpmsg 183 common afs_syscall 184 common tuxcall 185 common security -186 common gettid sys_gettid -187 common readahead sys_readahead -188 common setxattr sys_setxattr -189 common lsetxattr sys_lsetxattr -190 common fsetxattr sys_fsetxattr -191 common getxattr sys_getxattr -192 common lgetxattr sys_lgetxattr -193 common fgetxattr sys_fgetxattr -194 common listxattr sys_listxattr -195 common llistxattr sys_llistxattr -196 common flistxattr sys_flistxattr -197 common removexattr sys_removexattr -198 common lremovexattr sys_lremovexattr -199 common fremovexattr sys_fremovexattr -200 common tkill sys_tkill -201 common time sys_time -202 common futex sys_futex -203 common sched_setaffinity sys_sched_setaffinity -204 common sched_getaffinity sys_sched_getaffinity +186 common gettid __x64_sys_gettid +187 common readahead __x64_sys_readahead +188 common setxattr __x64_sys_setxattr +189 common lsetxattr __x64_sys_lsetxattr +190 common fsetxattr __x64_sys_fsetxattr +191 common getxattr __x64_sys_getxattr +192 common lgetxattr __x64_sys_lgetxattr +193 common fgetxattr __x64_sys_fgetxattr +194 common listxattr __x64_sys_listxattr +195 common llistxattr __x64_sys_llistxattr +196 common flistxattr __x64_sys_flistxattr +197 common removexattr __x64_sys_removexattr +198 common lremovexattr __x64_sys_lremovexattr +199 common fremovexattr __x64_sys_fremovexattr +200 common tkill __x64_sys_tkill +201 common time __x64_sys_time +202 common futex __x64_sys_futex +203 common sched_setaffinity __x64_sys_sched_setaffinity +204 common sched_getaffinity __x64_sys_sched_getaffinity 205 64 set_thread_area -206 64 io_setup sys_io_setup -207 common io_destroy sys_io_destroy -208 common io_getevents sys_io_getevents -209 64 io_submit sys_io_submit -210 common io_cancel sys_io_cancel +206 64 io_setup __x64_sys_io_setup +207 common io_destroy __x64_sys_io_destroy +208 common io_getevents __x64_sys_io_getevents +209 64 io_submit __x64_sys_io_submit +210 common io_cancel __x64_sys_io_cancel 211 64 get_thread_area -212 common lookup_dcookie sys_lookup_dcookie -213 common epoll_create sys_epoll_create +212 common lookup_dcookie __x64_sys_lookup_dcookie +213 common epoll_create __x64_sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old -216 common remap_file_pages sys_remap_file_pages -217 common getdents64 sys_getdents64 -218 common set_tid_address sys_set_tid_address -219 common restart_syscall sys_restart_syscall -220 common semtimedop sys_semtimedop -221 common fadvise64 sys_fadvise64 -222 64 timer_create sys_timer_create -223 common timer_settime sys_timer_settime -224 common timer_gettime sys_timer_gettime -225 common timer_getoverrun sys_timer_getoverrun -226 common timer_delete sys_timer_delete -227 common clock_settime sys_clock_settime -228 common clock_gettime sys_clock_gettime -229 common clock_getres sys_clock_getres -230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group -232 common epoll_wait sys_epoll_wait -233 common epoll_ctl sys_epoll_ctl -234 common tgkill sys_tgkill -235 common utimes sys_utimes +216 common remap_file_pages __x64_sys_remap_file_pages +217 common getdents64 __x64_sys_getdents64 +218 common set_tid_address __x64_sys_set_tid_address +219 common restart_syscall __x64_sys_restart_syscall +220 common semtimedop __x64_sys_semtimedop +221 common fadvise64 __x64_sys_fadvise64 +222 64 timer_create __x64_sys_timer_create +223 common timer_settime __x64_sys_timer_settime +224 common timer_gettime __x64_sys_timer_gettime +225 common timer_getoverrun __x64_sys_timer_getoverrun +226 common timer_delete __x64_sys_timer_delete +227 common clock_settime __x64_sys_clock_settime +228 common clock_gettime __x64_sys_clock_gettime +229 common clock_getres __x64_sys_clock_getres +230 common clock_nanosleep __x64_sys_clock_nanosleep +231 common exit_group __x64_sys_exit_group +232 common epoll_wait __x64_sys_epoll_wait +233 common epoll_ctl __x64_sys_epoll_ctl +234 common tgkill __x64_sys_tgkill +235 common utimes __x64_sys_utimes 236 64 vserver -237 common mbind sys_mbind -238 common set_mempolicy sys_set_mempolicy -239 common get_mempolicy sys_get_mempolicy -240 common mq_open sys_mq_open -241 common mq_unlink sys_mq_unlink -242 common mq_timedsend sys_mq_timedsend -243 common mq_timedreceive sys_mq_timedreceive -244 64 mq_notify sys_mq_notify -245 common mq_getsetattr sys_mq_getsetattr -246 64 kexec_load sys_kexec_load -247 64 waitid sys_waitid -248 common add_key sys_add_key -249 common request_key sys_request_key -250 common keyctl sys_keyctl -251 common ioprio_set sys_ioprio_set -252 common ioprio_get sys_ioprio_get -253 common inotify_init sys_inotify_init -254 common inotify_add_watch sys_inotify_add_watch -255 common inotify_rm_watch sys_inotify_rm_watch -256 common migrate_pages sys_migrate_pages -257 common openat sys_openat -258 common mkdirat sys_mkdirat -259 common mknodat sys_mknodat -260 common fchownat sys_fchownat -261 common futimesat sys_futimesat -262 common newfstatat sys_newfstatat -263 common unlinkat sys_unlinkat -264 common renameat sys_renameat -265 common linkat sys_linkat -266 common symlinkat sys_symlinkat -267 common readlinkat sys_readlinkat -268 common fchmodat sys_fchmodat -269 common faccessat sys_faccessat -270 common pselect6 sys_pselect6 -271 common ppoll sys_ppoll -272 common unshare sys_unshare -273 64 set_robust_list sys_set_robust_list -274 64 get_robust_list sys_get_robust_list -275 common splice sys_splice -276 common tee sys_tee -277 common sync_file_range sys_sync_file_range -278 64 vmsplice sys_vmsplice -279 64 move_pages sys_move_pages -280 common utimensat sys_utimensat -281 common epoll_pwait sys_epoll_pwait -282 common signalfd sys_signalfd -283 common timerfd_create sys_timerfd_create -284 common eventfd sys_eventfd -285 common fallocate sys_fallocate -286 common timerfd_settime sys_timerfd_settime -287 common timerfd_gettime sys_timerfd_gettime -288 common accept4 sys_accept4 -289 common signalfd4 sys_signalfd4 -290 common eventfd2 sys_eventfd2 -291 common epoll_create1 sys_epoll_create1 -292 common dup3 sys_dup3 -293 common pipe2 sys_pipe2 -294 common inotify_init1 sys_inotify_init1 -295 64 preadv sys_preadv -296 64 pwritev sys_pwritev -297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo -298 common perf_event_open sys_perf_event_open -299 64 recvmmsg sys_recvmmsg -300 common fanotify_init sys_fanotify_init -301 common fanotify_mark sys_fanotify_mark -302 common prlimit64 sys_prlimit64 -303 common name_to_handle_at sys_name_to_handle_at -304 common open_by_handle_at sys_open_by_handle_at -305 common clock_adjtime sys_clock_adjtime -306 common syncfs sys_syncfs -307 64 sendmmsg sys_sendmmsg -308 common setns sys_setns -309 common getcpu sys_getcpu -310 64 process_vm_readv sys_process_vm_readv -311 64 process_vm_writev sys_process_vm_writev -312 common kcmp sys_kcmp -313 common finit_module sys_finit_module -314 common sched_setattr sys_sched_setattr -315 common sched_getattr sys_sched_getattr -316 common renameat2 sys_renameat2 -317 common seccomp sys_seccomp -318 common getrandom sys_getrandom -319 common memfd_create sys_memfd_create -320 common kexec_file_load sys_kexec_file_load -321 common bpf sys_bpf -322 64 execveat sys_execveat/ptregs -323 common userfaultfd sys_userfaultfd -324 common membarrier sys_membarrier -325 common mlock2 sys_mlock2 -326 common copy_file_range sys_copy_file_range -327 64 preadv2 sys_preadv2 -328 64 pwritev2 sys_pwritev2 -329 common pkey_mprotect sys_pkey_mprotect -330 common pkey_alloc sys_pkey_alloc -331 common pkey_free sys_pkey_free -332 common statx sys_statx +237 common mbind __x64_sys_mbind +238 common set_mempolicy __x64_sys_set_mempolicy +239 common get_mempolicy __x64_sys_get_mempolicy +240 common mq_open __x64_sys_mq_open +241 common mq_unlink __x64_sys_mq_unlink +242 common mq_timedsend __x64_sys_mq_timedsend +243 common mq_timedreceive __x64_sys_mq_timedreceive +244 64 mq_notify __x64_sys_mq_notify +245 common mq_getsetattr __x64_sys_mq_getsetattr +246 64 kexec_load __x64_sys_kexec_load +247 64 waitid __x64_sys_waitid +248 common add_key __x64_sys_add_key +249 common request_key __x64_sys_request_key +250 common keyctl __x64_sys_keyctl +251 common ioprio_set __x64_sys_ioprio_set +252 common ioprio_get __x64_sys_ioprio_get +253 common inotify_init __x64_sys_inotify_init +254 common inotify_add_watch __x64_sys_inotify_add_watch +255 common inotify_rm_watch __x64_sys_inotify_rm_watch +256 common migrate_pages __x64_sys_migrate_pages +257 common openat __x64_sys_openat +258 common mkdirat __x64_sys_mkdirat +259 common mknodat __x64_sys_mknodat +260 common fchownat __x64_sys_fchownat +261 common futimesat __x64_sys_futimesat +262 common newfstatat __x64_sys_newfstatat +263 common unlinkat __x64_sys_unlinkat +264 common renameat __x64_sys_renameat +265 common linkat __x64_sys_linkat +266 common symlinkat __x64_sys_symlinkat +267 common readlinkat __x64_sys_readlinkat +268 common fchmodat __x64_sys_fchmodat +269 common faccessat __x64_sys_faccessat +270 common pselect6 __x64_sys_pselect6 +271 common ppoll __x64_sys_ppoll +272 common unshare __x64_sys_unshare +273 64 set_robust_list __x64_sys_set_robust_list +274 64 get_robust_list __x64_sys_get_robust_list +275 common splice __x64_sys_splice +276 common tee __x64_sys_tee +277 common sync_file_range __x64_sys_sync_file_range +278 64 vmsplice __x64_sys_vmsplice +279 64 move_pages __x64_sys_move_pages +280 common utimensat __x64_sys_utimensat +281 common epoll_pwait __x64_sys_epoll_pwait +282 common signalfd __x64_sys_signalfd +283 common timerfd_create __x64_sys_timerfd_create +284 common eventfd __x64_sys_eventfd +285 common fallocate __x64_sys_fallocate +286 common timerfd_settime __x64_sys_timerfd_settime +287 common timerfd_gettime __x64_sys_timerfd_gettime +288 common accept4 __x64_sys_accept4 +289 common signalfd4 __x64_sys_signalfd4 +290 common eventfd2 __x64_sys_eventfd2 +291 common epoll_create1 __x64_sys_epoll_create1 +292 common dup3 __x64_sys_dup3 +293 common pipe2 __x64_sys_pipe2 +294 common inotify_init1 __x64_sys_inotify_init1 +295 64 preadv __x64_sys_preadv +296 64 pwritev __x64_sys_pwritev +297 64 rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo +298 common perf_event_open __x64_sys_perf_event_open +299 64 recvmmsg __x64_sys_recvmmsg +300 common fanotify_init __x64_sys_fanotify_init +301 common fanotify_mark __x64_sys_fanotify_mark +302 common prlimit64 __x64_sys_prlimit64 +303 common name_to_handle_at __x64_sys_name_to_handle_at +304 common open_by_handle_at __x64_sys_open_by_handle_at +305 common clock_adjtime __x64_sys_clock_adjtime +306 common syncfs __x64_sys_syncfs +307 64 sendmmsg __x64_sys_sendmmsg +308 common setns __x64_sys_setns +309 common getcpu __x64_sys_getcpu +310 64 process_vm_readv __x64_sys_process_vm_readv +311 64 process_vm_writev __x64_sys_process_vm_writev +312 common kcmp __x64_sys_kcmp +313 common finit_module __x64_sys_finit_module +314 common sched_setattr __x64_sys_sched_setattr +315 common sched_getattr __x64_sys_sched_getattr +316 common renameat2 __x64_sys_renameat2 +317 common seccomp __x64_sys_seccomp +318 common getrandom __x64_sys_getrandom +319 common memfd_create __x64_sys_memfd_create +320 common kexec_file_load __x64_sys_kexec_file_load +321 common bpf __x64_sys_bpf +322 64 execveat __x64_sys_execveat/ptregs +323 common userfaultfd __x64_sys_userfaultfd +324 common membarrier __x64_sys_membarrier +325 common mlock2 __x64_sys_mlock2 +326 common copy_file_range __x64_sys_copy_file_range +327 64 preadv2 __x64_sys_preadv2 +328 64 pwritev2 __x64_sys_pwritev2 +329 common pkey_mprotect __x64_sys_pkey_mprotect +330 common pkey_alloc __x64_sys_pkey_alloc +331 common pkey_free __x64_sys_pkey_free +332 common statx __x64_sys_statx # # x32-specific system call numbers start at 512 to avoid cache impact -# for native 64-bit operation. +# for native 64-bit operation. The __x32_compat_sys stubs are created +# on-the-fly for compat_sys_*() compatibility system calls if X86_X32 +# is defined. # -512 x32 rt_sigaction compat_sys_rt_sigaction +512 x32 rt_sigaction __x32_compat_sys_rt_sigaction 513 x32 rt_sigreturn sys32_x32_rt_sigreturn -514 x32 ioctl compat_sys_ioctl -515 x32 readv compat_sys_readv -516 x32 writev compat_sys_writev -517 x32 recvfrom compat_sys_recvfrom -518 x32 sendmsg compat_sys_sendmsg -519 x32 recvmsg compat_sys_recvmsg -520 x32 execve compat_sys_execve/ptregs -521 x32 ptrace compat_sys_ptrace -522 x32 rt_sigpending compat_sys_rt_sigpending -523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait -524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo -525 x32 sigaltstack compat_sys_sigaltstack -526 x32 timer_create compat_sys_timer_create -527 x32 mq_notify compat_sys_mq_notify -528 x32 kexec_load compat_sys_kexec_load -529 x32 waitid compat_sys_waitid -530 x32 set_robust_list compat_sys_set_robust_list -531 x32 get_robust_list compat_sys_get_robust_list -532 x32 vmsplice compat_sys_vmsplice -533 x32 move_pages compat_sys_move_pages -534 x32 preadv compat_sys_preadv64 -535 x32 pwritev compat_sys_pwritev64 -536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -537 x32 recvmmsg compat_sys_recvmmsg -538 x32 sendmmsg compat_sys_sendmmsg -539 x32 process_vm_readv compat_sys_process_vm_readv -540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt -543 x32 io_setup compat_sys_io_setup -544 x32 io_submit compat_sys_io_submit -545 x32 execveat compat_sys_execveat/ptregs -546 x32 preadv2 compat_sys_preadv64v2 -547 x32 pwritev2 compat_sys_pwritev64v2 +514 x32 ioctl __x32_compat_sys_ioctl +515 x32 readv __x32_compat_sys_readv +516 x32 writev __x32_compat_sys_writev +517 x32 recvfrom __x32_compat_sys_recvfrom +518 x32 sendmsg __x32_compat_sys_sendmsg +519 x32 recvmsg __x32_compat_sys_recvmsg +520 x32 execve __x32_compat_sys_execve/ptregs +521 x32 ptrace __x32_compat_sys_ptrace +522 x32 rt_sigpending __x32_compat_sys_rt_sigpending +523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo +525 x32 sigaltstack __x32_compat_sys_sigaltstack +526 x32 timer_create __x32_compat_sys_timer_create +527 x32 mq_notify __x32_compat_sys_mq_notify +528 x32 kexec_load __x32_compat_sys_kexec_load +529 x32 waitid __x32_compat_sys_waitid +530 x32 set_robust_list __x32_compat_sys_set_robust_list +531 x32 get_robust_list __x32_compat_sys_get_robust_list +532 x32 vmsplice __x32_compat_sys_vmsplice +533 x32 move_pages __x32_compat_sys_move_pages +534 x32 preadv __x32_compat_sys_preadv64 +535 x32 pwritev __x32_compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg __x32_compat_sys_recvmmsg +538 x32 sendmmsg __x32_compat_sys_sendmmsg +539 x32 process_vm_readv __x32_compat_sys_process_vm_readv +540 x32 process_vm_writev __x32_compat_sys_process_vm_writev +541 x32 setsockopt __x32_compat_sys_setsockopt +542 x32 getsockopt __x32_compat_sys_getsockopt +543 x32 io_setup __x32_compat_sys_io_setup +544 x32 io_submit __x32_compat_sys_io_submit +545 x32 execveat __x32_compat_sys_execveat/ptregs +546 x32 preadv2 __x32_compat_sys_preadv64v2 +547 x32 pwritev2 __x32_compat_sys_pwritev64v2 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index d71ef4bd3615..94fcd1951aca 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -25,15 +25,27 @@ emit() { nr="$2" entry="$3" compat="$4" + umlentry="" if [ "$abi" = "64" -a -n "$compat" ]; then echo "a compat entry for a 64-bit syscall makes no sense" >&2 exit 1 fi + # For CONFIG_UML, we need to strip the __x64_sys prefix + if [ "$abi" = "64" -a "${entry}" != "${entry#__x64_sys}" ]; then + umlentry="sys${entry#__x64_sys}" + fi + if [ -z "$compat" ]; then - if [ -n "$entry" ]; then + if [ -n "$entry" -a -z "$umlentry" ]; then syscall_macro "$abi" "$nr" "$entry" + elif [ -n "$umlentry" ]; then # implies -n "$entry" + echo "#ifdef CONFIG_X86" + syscall_macro "$abi" "$nr" "$entry" + echo "#else /* CONFIG_UML */" + syscall_macro "$abi" "$nr" "$umlentry" + echo "#endif" fi else echo "#ifdef CONFIG_X86_32" diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 1943aebadede..d998a487c9b1 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -42,9 +42,7 @@ vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) obj-y += $(vdso_img_objs) targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) -.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ - $(vdso_img-y:%=$(obj)/vdso%.so) +targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) export CPPFLAGS_vdso.lds += -P -C diff --git a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c deleted file mode 100644 index 541468e25265..000000000000 --- a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c +++ /dev/null @@ -1 +0,0 @@ -#include "../vdso-fakesections.c" diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 317be365bce3..70b7845434cb 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -127,6 +127,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_err; long ret; + unsigned long orig_dx; /* * No point in checking CS -- the only way to get here is a user mode @@ -227,19 +228,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ret = -EFAULT; switch (vsyscall_nr) { case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); + /* this decodes regs->di and regs->si on its own */ + ret = __x64_sys_gettimeofday(regs); break; case 1: - ret = sys_time((time_t __user *)regs->di); + /* this decodes regs->di on its own */ + ret = __x64_sys_time(regs); break; case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); + /* while we could clobber regs->dx, we didn't in the past... */ + orig_dx = regs->dx; + regs->dx = 0; + /* this decodes regs->di, regs->si and regs->dx on its own */ + ret = __x64_sys_getcpu(regs); + regs->dx = orig_dx; break; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a6006e7bb729..45b2b1c93d04 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -27,6 +27,7 @@ #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> +#include <linux/nospec.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -304,17 +305,20 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) config = attr->config; - cache_type = (config >> 0) & 0xff; + cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; + cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; + cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; + cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); val = hw_cache_event_ids[cache_type][cache_op][cache_result]; @@ -421,6 +425,8 @@ int x86_setup_perfctr(struct perf_event *event) if (attr->config >= x86_pmu.max_events) return -EINVAL; + attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); + /* * The generic map: */ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 607bf565a90c..707b2a96e516 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3339,7 +3339,8 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; - flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (x86_pmu.version > 1) + flip_smm_bit(&x86_pmu.attr_freeze_on_smi); if (!cpuc->shared_regs) return; @@ -3502,6 +3503,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dying = intel_pmu_cpu_dying, }; +static struct attribute *intel_pmu_attrs[]; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -3533,6 +3536,8 @@ static __initconst const struct x86_pmu intel_pmu = { .format_attrs = intel_arch3_formats_attr, .events_sysfs_show = intel_event_sysfs_show, + .attrs = intel_pmu_attrs, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, @@ -3911,8 +3916,6 @@ __init int intel_pmu_init(void) x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); - - x86_pmu.attrs = intel_pmu_attrs; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 9aca448bb8e6..9f8084f18d58 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -92,6 +92,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/perf_event.h> +#include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "../perf_event.h" @@ -302,6 +303,7 @@ static int cstate_pmu_event_init(struct perf_event *event) } else if (event->pmu == &cstate_pkg_pmu) { if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); if (!pkg_msr[cfg].attr) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index da6780122786..8a10a045b57b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1153,7 +1153,6 @@ static void setup_pebs_sample_data(struct perf_event *event, if (pebs == NULL) return; - regs->flags &= ~PERF_EFLAGS_EXACT; sample_type = event->attr.sample_type; dsrc = sample_type & PERF_SAMPLE_DATA_SRC; @@ -1197,7 +1196,13 @@ static void setup_pebs_sample_data(struct perf_event *event, * and PMI. */ *regs = *iregs; - regs->flags = pebs->flags; + + /* + * Initialize regs_>flags from PEBS, + * Clear exact bit (which uses x86 EFLAGS Reserved bit 3), + * i.e., do not rely on it being zero: + */ + regs->flags = pebs->flags & ~PERF_EFLAGS_EXACT; if (sample_type & PERF_SAMPLE_REGS_INTR) { regs->ax = pebs->ax; @@ -1217,10 +1222,6 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->sp = pebs->sp; } - /* - * Preserve PERF_EFLAGS_VM from set_linear_ip(). - */ - regs->flags = pebs->flags | (regs->flags & PERF_EFLAGS_VM); #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; regs->r9 = pebs->r9; @@ -1234,20 +1235,33 @@ static void setup_pebs_sample_data(struct perf_event *event, } if (event->attr.precise_ip > 1) { - /* Haswell and later have the eventing IP, so use it: */ + /* + * Haswell and later processors have an 'eventing IP' + * (real IP) which fixes the off-by-1 skid in hardware. + * Use it when precise_ip >= 2 : + */ if (x86_pmu.intel_cap.pebs_format >= 2) { set_linear_ip(regs, pebs->real_ip); regs->flags |= PERF_EFLAGS_EXACT; } else { - /* Otherwise use PEBS off-by-1 IP: */ + /* Otherwise, use PEBS off-by-1 IP: */ set_linear_ip(regs, pebs->ip); - /* ... and try to fix it up using the LBR entries: */ + /* + * With precise_ip >= 2, try to fix up the off-by-1 IP + * using the LBR. If successful, the fixup function + * corrects regs->ip and calls set_linear_ip() on regs: + */ if (intel_pmu_pebs_fixup_ip(regs)) regs->flags |= PERF_EFLAGS_EXACT; } - } else + } else { + /* + * When precise_ip == 1, return the PEBS off-by-1 IP, + * no fixup attempted: + */ set_linear_ip(regs, pebs->ip); + } if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index c98b943e58b4..77076a102e34 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3028,10 +3028,27 @@ static struct intel_uncore_type bdx_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +static struct intel_uncore_type bdx_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +#define BDX_MSR_UNCORE_SBOX 3 + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, &hswep_uncore_pcu, + &bdx_uncore_sbox, NULL, }; @@ -3043,10 +3060,25 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { + int pkg = topology_phys_to_logical_pkg(0); + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + /* BDX-DE doesn't have SBOX */ + if (boot_cpu_data.x86_model == 86) { + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + /* Detect systems with no SBOXes */ + } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { + struct pci_dev *pdev; + u32 capid4; + + pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; + pci_read_config_dword(pdev, 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + } hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3264,6 +3296,11 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), }, + { /* PCU.3 (for Capability registers) */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, { /* end: all zeroes */ } }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index e7edf19e64c2..b4771a6ddbc1 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> +#include <linux/nospec.h> #include <asm/intel-family.h> enum perf_msr_id { @@ -158,9 +159,6 @@ static int msr_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; - if (cfg >= PERF_MSR_EVENT_MAX) - return -EINVAL; - /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || @@ -171,6 +169,11 @@ static int msr_event_init(struct perf_event *event) event->attr.sample_period) /* no sampling */ return -EINVAL; + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX); + if (!msr[cfg].attr) return -EINVAL; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 40a3d3642f3a..08acd954f00e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -313,7 +313,7 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(int apicid); + int (*apic_id_valid)(u32 apicid); int (*apic_id_registered)(void); bool (*check_apicid_used)(physid_mask_t *map, int apicid); @@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } -extern int default_apic_id_valid(int apicid); +extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 386a6900e206..219faaec51df 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,7 +136,6 @@ #endif #ifndef __ASSEMBLY__ -#ifndef __BPF__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -146,6 +145,5 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif -#endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index b27da9602a6d..aced6c9290d6 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -140,6 +140,20 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) +#if defined(__clang__) && !defined(CC_HAVE_ASM_GOTO) + +/* + * Workaround for the sake of BPF compilation which utilizes kernel + * headers, but clang does not support ASM GOTO and fails the build. + */ +#ifndef __BPF_TRACING__ +#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" +#endif + +#define static_cpu_has(bit) boot_cpu_has(bit) + +#else + /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These will statically patch the target code for additional @@ -195,6 +209,7 @@ t_no: boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) +#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d554c11e01ff..fb00a2fca990 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -198,7 +198,6 @@ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ - #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ @@ -207,13 +206,19 @@ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ - +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ #define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ - #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -274,9 +279,10 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -320,6 +326,7 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ @@ -333,6 +340,7 @@ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* * BUG word(s) @@ -362,5 +370,6 @@ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 89ce4bfd241f..ce4d176b3d13 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -30,10 +30,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -int arch_dma_supported(struct device *dev, u64 mask); -#define arch_dma_supported arch_dma_supported - -bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +bool arch_dma_alloc_attrs(struct device **dev); #define arch_dma_alloc_attrs arch_dma_alloc_attrs #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 09ad88572746..cc8f8fcf9b4a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -46,7 +46,21 @@ int ftrace_int3_handler(struct pt_regs *regs); #endif /* CONFIG_FUNCTION_TRACER */ -#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS) +#ifndef __ASSEMBLY__ + +#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Compare the symbol name with the system call name. Skip the + * "__x64_sys", "__ia32_sys" or simple "sys" prefix. + */ + return !strcmp(sym + 3, name + 3) || + (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) || + (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)); +} + +#ifndef COMPILE_OFFSETS #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) #include <asm/compat.h> @@ -67,6 +81,7 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) return false; } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ -#endif /* !__ASSEMBLY__ && !COMPILE_OFFSETS */ +#endif /* !COMPILE_OFFSETS */ +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index b3e32b010ab1..c2c01f84df75 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -208,4 +208,22 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +#define POP_SS_OPCODE 0x1f +#define MOV_SREG_OPCODE 0x8e + +/* + * Intel SDM Vol.3A 6.8.3 states; + * "Any single-step trap that would be delivered following the MOV to SS + * instruction or POP to SS instruction (because EFLAGS.TF is 1) is + * suppressed." + * This function returns true if @insn is MOV SS or POP SS. On these + * instructions, single stepping is suppressed. + */ +static inline int insn_masking_exception(struct insn *insn) +{ + return insn->opcode.bytes[0] == POP_SS_OPCODE || + (insn->opcode.bytes[0] == MOV_SREG_OPCODE && + X86_MODRM_REG(insn->modrm.bytes[0]) == 2); +} + #endif /* _ASM_X86_INSN_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 404c5fdff859..548d90bbf919 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -34,11 +34,6 @@ * (0x80 is the syscall vector, 0x30-0x3f are for ISA) */ #define FIRST_EXTERNAL_VECTOR 0x20 -/* - * We start allocating at 0x21 to spread out vectors evenly between - * priority levels. (0x80 is the syscall vector) - */ -#define VECTOR_OFFSET_START 1 /* * Reserve the lowest usable vector (and hence lowest priority) 0x20 for @@ -119,8 +114,6 @@ #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif -#define FPU_IRQ 13 - /* * Size the maximum number of interrupts. * diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h index b885a961a150..a34897aef2c2 100644 --- a/arch/x86/include/asm/jailhouse_para.h +++ b/arch/x86/include/asm/jailhouse_para.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL2.0 */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Jailhouse paravirt detection diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h index 9f07cff43705..df89ee7d3e9e 100644 --- a/arch/x86/include/asm/kexec-bzimage64.h +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -2,6 +2,6 @@ #ifndef _ASM_KEXEC_BZIMAGE64_H #define _ASM_KEXEC_BZIMAGE64_H -extern struct kexec_file_ops kexec_bzImage64_ops; +extern const struct kexec_file_ops kexec_bzImage64_ops; #endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 949c977bc4c9..f4b2588865e9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -924,7 +924,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); - bool (*cpu_has_high_real_mode_segbase)(void); + bool (*has_emulated_msr)(int index); void (*cpuid_update)(struct kvm_vcpu *vcpu); struct kvm *(*vm_alloc)(void); @@ -1013,6 +1013,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 57e3785d0d26..cf9911b5a53c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { - /* pkey 0 is the default and always allocated */ + /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 53d5b1b9255e..fda2114197b3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -42,6 +42,8 @@ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ +#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ @@ -68,6 +70,11 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SSB_NO (1 << 4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e @@ -340,6 +347,8 @@ #define MSR_AMD64_SEV_ENABLED_BIT 0 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f928ad9b143f..8b38df98548e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -217,6 +217,14 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* The Speculative Store Bypass disable variants */ +enum ssb_mitigation { + SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_DISABLE, + SPEC_STORE_BYPASS_PRCTL, + SPEC_STORE_BYPASS_SECCOMP, +}; + extern char __indirect_thunk_start[]; extern char __indirect_thunk_end[]; @@ -241,22 +249,27 @@ static inline void vmexit_fill_RSB(void) #endif } -#define alternative_msr_write(_msr, _val, _feature) \ - asm volatile(ALTERNATIVE("", \ - "movl %[msr], %%ecx\n\t" \ - "movl %[val], %%eax\n\t" \ - "movl $0, %%edx\n\t" \ - "wrmsr", \ - _feature) \ - : : [msr] "i" (_msr), [val] "i" (_val) \ - : "eax", "ecx", "edx", "memory") +static __always_inline +void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) +{ + asm volatile(ALTERNATIVE("", "wrmsr", %c[feature]) + : : "c" (msr), + "a" ((u32)val), + "d" ((u32)(val >> 32)), + [feature] "i" (feature) + : "memory"); +} static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, - X86_FEATURE_USE_IBPB); + u64 val = PRED_CMD_IBPB; + + alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); } +/* The Intel SPEC CTRL MSR base value cache */ +extern u64 x86_spec_ctrl_base; + /* * With retpoline, we must use IBRS to restrict branch prediction * before calling into firmware. @@ -265,14 +278,18 @@ static inline void indirect_branch_prediction_barrier(void) */ #define firmware_restrict_branch_speculation_start() \ do { \ + u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ + \ preempt_disable(); \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ X86_FEATURE_USE_IBRS_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ do { \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, 0, \ + u64 val = x86_spec_ctrl_base; \ + \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ X86_FEATURE_USE_IBRS_FW); \ preempt_enable(); \ } while (0) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d32175e30259..662963681ea6 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -117,9 +117,6 @@ void native_restore_msi_irqs(struct pci_dev *dev); #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif - -#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) - #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 89d5c8886c85..f1633de5a675 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -526,22 +526,39 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot) return protval; } +static inline pgprotval_t check_pgprot(pgprot_t pgprot) +{ + pgprotval_t massaged_val = massage_pgprot(pgprot); + + /* mmdebug.h can not be included here because of dependencies */ +#ifdef CONFIG_DEBUG_VM + WARN_ONCE(pgprot_val(pgprot) != massaged_val, + "attempted to set unsupported pgprot: %016llx " + "bits: %016llx supported: %016llx\n", + (u64)pgprot_val(pgprot), + (u64)pgprot_val(pgprot) ^ massaged_val, + (u64)__supported_pte_mask); +#endif + + return massaged_val; +} + static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | - massage_pgprot(pgprot)); + check_pgprot(pgprot)); } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -553,7 +570,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * the newprot (if present): */ val &= _PAGE_CHG_MASK; - val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; return __pte(val); } @@ -563,7 +580,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) pmdval_t val = pmd_val(pmd); val &= _HPAGE_CHG_MASK; - val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; return __pmd(val); } @@ -584,6 +601,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) +static inline pgprot_t arch_filter_pgprot(pgprot_t prot) +{ + return canon_pgprot(prot); +} + static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index d5c21a382475..adb47552e6bb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -105,14 +105,14 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) -#define __VMALLOC_BASE_L4 0xffffc90000000000 -#define __VMALLOC_BASE_L5 0xffa0000000000000 +#define __VMALLOC_BASE_L4 0xffffc90000000000UL +#define __VMALLOC_BASE_L5 0xffa0000000000000UL #define VMALLOC_SIZE_TB_L4 32UL #define VMALLOC_SIZE_TB_L5 12800UL -#define __VMEMMAP_BASE_L4 0xffffea0000000000 -#define __VMEMMAP_BASE_L5 0xffd4000000000000 +#define __VMEMMAP_BASE_L4 0xffffea0000000000UL +#define __VMEMMAP_BASE_L5 0xffd4000000000000UL #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index acfe755562a6..1e5a40673953 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -196,19 +196,21 @@ enum page_cache_mode { #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) -#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) -#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) - -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define default_pgprot(x) __pgprot((x) & __default_kernel_pte_mask) + +#define PAGE_KERNEL default_pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC default_pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO default_pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC default_pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC default_pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX default_pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE default_pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE default_pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC default_pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VVAR default_pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO default_pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE default_pgprot(__PAGE_KERNEL_IO_NOCACHE) #endif /* __ASSEMBLY__ */ @@ -483,6 +485,7 @@ static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; +extern pteval_t __default_kernel_pte_mask; extern void set_nx(void); extern int nx_enabled; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index a0ba1ffda0df..851c04b7a092 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H +#define ARCH_DEFAULT_PKEY 0 + #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, @@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return 0; + return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } @@ -49,13 +51,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned - * from pkey_alloc(). pkey 0 is special, and never - * returned from pkey_alloc(). + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. */ - if (pkey <= 0) + if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; + /* + * The exec-only pkey is set in the allocation map, but + * is not available to any of the user interfaces like + * mprotect_pkey(). + */ + if (pkey == mm->context.execute_only_pkey) + return false; + return mm_pkey_allocation_map(mm) & (1U << pkey); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4fa4206029e3..21a114914ba4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -749,13 +749,11 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, extern void enable_sep_cpu(void); extern int sysenter_setup(void); -extern void early_trap_init(void); void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 0b5ef05b2d2d..38a17f1d5c9d 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -6,8 +6,10 @@ #ifdef CONFIG_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); +extern void pti_clone_kernel_text(void); #else static inline void pti_check_boottime_disable(void) { } +static inline void pti_clone_kernel_text(void) { } #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h new file mode 100644 index 000000000000..ae7c2c5cd7f0 --- /dev/null +++ b/arch/x86/include/asm/spec-ctrl.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SPECCTRL_H_ +#define _ASM_X86_SPECCTRL_H_ + +#include <linux/thread_info.h> +#include <asm/nospec-branch.h> + +/* + * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR + * the guest has, while on VMEXIT we restore the host view. This + * would be easier if SPEC_CTRL were architecturally maskable or + * shadowable for guests but this is not (currently) the case. + * Takes the guest view of SPEC_CTRL MSR as a parameter and also + * the guest's version of VIRT_SPEC_CTRL, if emulated. + */ +extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); + +/** + * x86_spec_ctrl_set_guest - Set speculation control registers for the guest + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); +} + +/** + * x86_spec_ctrl_restore_host - Restore host speculation control registers + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); +} + +/* AMD specific Speculative Store Bypass MSR data */ +extern u64 x86_amd_ls_cfg_base; +extern u64 x86_amd_ls_cfg_ssbd_mask; + +static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) +{ + return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; +} + +#ifdef CONFIG_SMP +extern void speculative_store_bypass_ht_init(void); +#else +static inline void speculative_store_bypass_ht_init(void) { } +#endif + +extern void speculative_store_bypass_update(unsigned long tif); + +static inline void speculative_store_bypass_update_current(void) +{ + speculative_store_bypass_update(current_thread_info()->flags); +} + +#endif diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 03eedc21246d..d653139857af 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,9 +20,13 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> +#ifdef CONFIG_X86_64 +typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *); +#else typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_X86_64 */ extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h new file mode 100644 index 000000000000..e046a405743d --- /dev/null +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * syscall_wrapper.h - x86 specific wrappers to syscall definitions + */ + +#ifndef _ASM_X86_SYSCALL_WRAPPER_H +#define _ASM_X86_SYSCALL_WRAPPER_H + +/* Mapping of registers to parameters for syscalls on x86-64 and x32 */ +#define SC_X86_64_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,regs->di,,regs->si,,regs->dx \ + ,,regs->r10,,regs->r8,,regs->r9) \ + +/* Mapping of registers to parameters for syscalls on i386 */ +#define SC_IA32_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ + ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ + ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + +#ifdef CONFIG_IA32_EMULATION +/* + * For IA32 emulation, we need to handle "compat" syscalls *and* create + * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the + * ia32 regs in the proper order for shared or "common" syscalls. As some + * syscalls may not be implemented, we need to expand COND_SYSCALL in + * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this + * case as well. + */ +#define __IA32_COMPAT_SYS_STUBx(x, name, ...) \ + asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO); \ + asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ + } \ + +#define __IA32_SYS_STUBx(x, name, ...) \ + asmlinkage long __ia32_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__ia32_sys##name, ERRNO); \ + asmlinkage long __ia32_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ + } + +/* + * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias + * named __ia32_sys_*() + */ +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ + asmlinkage long __x64_sys_##sname(void) + +#define COND_SYSCALL(name) \ + cond_syscall(__x64_sys_##name); \ + cond_syscall(__ia32_sys_##name) + +#define SYS_NI(name) \ + SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); \ + SYSCALL_ALIAS(__ia32_sys_##name, sys_ni_posix_timers) + +#else /* CONFIG_IA32_EMULATION */ +#define __IA32_COMPAT_SYS_STUBx(x, name, ...) +#define __IA32_SYS_STUBx(x, fullname, name, ...) +#endif /* CONFIG_IA32_EMULATION */ + + +#ifdef CONFIG_X86_X32 +/* + * For the x32 ABI, we need to create a stub for compat_sys_*() which is aware + * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common + * with x86_64 obviously do not need such care. + */ +#define __X32_COMPAT_SYS_STUBx(x, name, ...) \ + asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO); \ + asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ + } \ + +#else /* CONFIG_X86_X32 */ +#define __X32_COMPAT_SYS_STUBx(x, name, ...) +#endif /* CONFIG_X86_X32 */ + + +#ifdef CONFIG_COMPAT +/* + * Compat means IA32_EMULATION and/or X86_X32. As they use a different + * mapping of registers to parameters, we need to generate stubs for each + * of them. + */ +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + __IA32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \ + __X32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ + } \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +/* + * As some compat syscalls may not be implemented, we need to expand + * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in + * kernel/time/posix-stubs.c to cover this case as well. + */ +#define COND_SYSCALL_COMPAT(name) \ + cond_syscall(__ia32_compat_sys_##name); \ + cond_syscall(__x32_compat_sys_##name) + +#define COMPAT_SYS_NI(name) \ + SYSCALL_ALIAS(__ia32_compat_sys_##name, sys_ni_posix_timers); \ + SYSCALL_ALIAS(__x32_compat_sys_##name, sys_ni_posix_timers) + +#endif /* CONFIG_COMPAT */ + + +/* + * Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes + * struct pt_regs *regs as the only argument of the syscall stub named + * __x64_sys_*(). It decodes just the registers it needs and passes them on to + * the __se_sys_*() wrapper performing sign extension and then to the + * __do_sys_*() function doing the actual job. These wrappers and functions + * are inlined (at least in very most cases), meaning that the assembly looks + * as follows (slightly re-ordered for better readability): + * + * <__x64_sys_recv>: <-- syscall with 4 parameters + * callq <__fentry__> + * + * mov 0x70(%rdi),%rdi <-- decode regs->di + * mov 0x68(%rdi),%rsi <-- decode regs->si + * mov 0x60(%rdi),%rdx <-- decode regs->dx + * mov 0x38(%rdi),%rcx <-- decode regs->r10 + * + * xor %r9d,%r9d <-- clear %r9 + * xor %r8d,%r8d <-- clear %r8 + * + * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom() + * which takes 6 arguments + * + * cltq <-- extend return value to 64-bit + * retq <-- return + * + * This approach avoids leaking random user-provided register content down + * the call chain. + * + * If IA32_EMULATION is enabled, this macro generates an additional wrapper + * named __ia32_sys_*() which decodes the struct pt_regs *regs according + * to the i386 calling convention (bx, cx, dx, si, di, bp). + */ +#define __SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long __x64_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__x64_sys##name, ERRNO); \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long __x64_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ + } \ + __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +/* + * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for + * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not + * hurt, we only need to re-define it here to keep the naming congruent to + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() + * macros to work correctly. + */ +#ifndef SYSCALL_DEFINE0 +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(void); \ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + asmlinkage long __x64_sys_##sname(void) +#endif + +#ifndef COND_SYSCALL +#define COND_SYSCALL(name) cond_syscall(__x64_sys_##name) +#endif + +#ifndef SYS_NI +#define SYS_NI(name) SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); +#endif + + +/* + * For VSYSCALLS, we need to declare these three syscalls with the new + * pt_regs-based calling convention for in-kernel use. + */ +struct pt_regs; +asmlinkage long __x64_sys_getcpu(const struct pt_regs *regs); +asmlinkage long __x64_sys_gettimeofday(const struct pt_regs *regs); +asmlinkage long __x64_sys_time(const struct pt_regs *regs); + +#endif /* _ASM_X86_SYSCALL_WRAPPER_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index ae6e05fdc24b..9fa979dd0d9d 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -18,6 +18,12 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on); + +#ifdef CONFIG_X86_32 +/* + * These definitions are only valid on pure 32-bit systems; x86-64 uses a + * different syscall calling convention + */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); @@ -32,7 +38,6 @@ asmlinkage long sys_set_thread_area(struct user_desc __user *); asmlinkage long sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ -#ifdef CONFIG_X86_32 /* kernel/signal.c */ asmlinkage long sys_sigreturn(void); @@ -42,15 +47,5 @@ struct vm86_struct; asmlinkage long sys_vm86old(struct vm86_struct __user *); asmlinkage long sys_vm86(unsigned long, unsigned long); -#else /* CONFIG_X86_32 */ - -/* X86_64 only */ -/* kernel/process_64.c */ -asmlinkage long sys_arch_prctl(int, unsigned long); - -/* kernel/sys_x86_64.c */ -asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a5d9521bb2cb..2ff2a30a264f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -79,6 +79,7 @@ struct thread_info { #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_SSBD 5 /* Reduced data speculation */ #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ @@ -105,6 +106,7 @@ struct thread_info { #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) @@ -144,7 +146,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 84137c22fdfa..6690cd3fc8b1 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -131,7 +131,12 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPU's the have same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index aebf60357758..a06cbf019744 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -137,15 +137,15 @@ struct boot_e820_entry { * setup data structure. */ struct jailhouse_setup_data { - u16 version; - u16 compatible_version; - u16 pm_timer_address; - u16 num_cpus; - u64 pci_mmconfig_base; - u32 tsc_khz; - u32 apic_khz; - u8 standard_ioapic; - u8 cpu_ids[255]; + __u16 version; + __u16 compatible_version; + __u16 pm_timer_address; + __u16 num_cpus; + __u64 pci_mmconfig_base; + __u32 tsc_khz; + __u32 apic_khz; + __u8 standard_ioapic; + __u8 cpu_ids[255]; } __attribute__((packed)); /* The so-called "zeropage" */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 4c851ebb3ceb..0ede697c3961 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -29,7 +29,7 @@ #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 -#define KVM_HINTS_DEDICATED 0 +#define KVM_HINTS_REALTIME 0 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h index 809134c644a6..90ab9a795b49 100644 --- a/arch/x86/include/uapi/asm/msgbuf.h +++ b/arch/x86/include/uapi/asm/msgbuf.h @@ -1 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_X64_MSGBUF_H +#define __ASM_X64_MSGBUF_H + +#if !defined(__x86_64__) || !defined(__ILP32__) #include <asm-generic/msgbuf.h> +#else +/* + * The msqid64_ds structure for x86 architecture with x32 ABI. + * + * On x86-32 and x86-64 we can just use the generic definition, but + * x32 uses the same binary layout as x86_64, which is differnet + * from other 32-bit architectures. + */ + +struct msqid64_ds { + struct ipc64_perm msg_perm; + __kernel_time_t msg_stime; /* last msgsnd time */ + __kernel_time_t msg_rtime; /* last msgrcv time */ + __kernel_time_t msg_ctime; /* last change time */ + __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */ + __kernel_ulong_t msg_qnum; /* number of messages in queue */ + __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */ + __kernel_pid_t msg_lspid; /* pid of last msgsnd */ + __kernel_pid_t msg_lrpid; /* last receive pid */ + __kernel_ulong_t __unused4; + __kernel_ulong_t __unused5; +}; + +#endif + +#endif /* __ASM_GENERIC_MSGBUF_H */ diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index 83c05fc2de38..644421f3823b 100644 --- a/arch/x86/include/uapi/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h @@ -1 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_X86_SHMBUF_H +#define __ASM_X86_SHMBUF_H + +#if !defined(__x86_64__) || !defined(__ILP32__) #include <asm-generic/shmbuf.h> +#else +/* + * The shmid64_ds structure for x86 architecture with x32 ABI. + * + * On x86-32 and x86-64 we can just use the generic definition, but + * x32 uses the same binary layout as x86_64, which is differnet + * from other 32-bit architectures. + */ + +struct shmid64_ds { + struct ipc64_perm shm_perm; /* operation perms */ + size_t shm_segsz; /* size of segment (bytes) */ + __kernel_time_t shm_atime; /* last attach time */ + __kernel_time_t shm_dtime; /* last detach time */ + __kernel_time_t shm_ctime; /* last change time */ + __kernel_pid_t shm_cpid; /* pid of creator */ + __kernel_pid_t shm_lpid; /* pid of last operator */ + __kernel_ulong_t shm_nattch; /* no. of current attaches */ + __kernel_ulong_t __unused4; + __kernel_ulong_t __unused5; +}; + +struct shminfo64 { + __kernel_ulong_t shmmax; + __kernel_ulong_t shmmin; + __kernel_ulong_t shmmni; + __kernel_ulong_t shmseg; + __kernel_ulong_t shmall; + __kernel_ulong_t __unused1; + __kernel_ulong_t __unused2; + __kernel_ulong_t __unused3; + __kernel_ulong_t __unused4; +}; + +#endif + +#endif /* __ASM_X86_SHMBUF_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7a37d9357bc4..3b20607d581b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; #ifdef CONFIG_X86_X2APIC - int apic_id; + u32 apic_id; u8 enabled; #endif @@ -215,6 +215,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; + /* Ignore invalid ID */ + if (apic_id == 0xffffffff) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -222,10 +226,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!apic->apic_id_valid(apic_id) && enabled) - printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); - else - acpi_register_lapic(apic_id, processor->uid, enabled); + if (!apic->apic_id_valid(apic_id)) { + if (enabled) + pr_warn(PREFIX "x2apic entry ignored\n"); + return 0; + } + + acpi_register_lapic(apic_id, processor->uid, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c88e0b127810..b481b95bd8f6 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -14,8 +14,11 @@ #include <asm/amd_nb.h> #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec /* Protect the PCI config register pairs used for SMN and DF indirect access. */ static DEFINE_MUTEX(smn_mutex); @@ -24,6 +27,7 @@ static u32 *flush_words; static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, {} }; @@ -39,6 +43,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, {} }; @@ -51,6 +56,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index a360801779ae..02b4839478b1 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid) return physid_isset(phys_apicid, phys_cpu_present_map); } -int default_apic_id_valid(int apicid) +int default_apic_id_valid(u32 apicid) { return (apicid < 255); } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 134e04506ab4..78778b54f904 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id) return id << 24; } -static int numachip_apic_id_valid(int apicid) +static int numachip_apic_id_valid(u32 apicid) { /* Trust what bootloader passes in MADT */ return 1; diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h index b107de381cb5..a49b3604027f 100644 --- a/arch/x86/kernel/apic/x2apic.h +++ b/arch/x86/kernel/apic/x2apic.h @@ -1,6 +1,6 @@ /* Common bits for X2APIC cluster/physical modes. */ -int x2apic_apic_id_valid(int apicid); +int x2apic_apic_id_valid(u32 apicid); int x2apic_apic_id_registered(void); void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); unsigned int x2apic_get_apic_id(unsigned long id); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8b04234e010b..7685444a106b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -116,6 +116,7 @@ static void init_x2apic_ldr(void) goto update; } cmsk = cluster_hotplug_mask; + cmsk->clusterid = cluster; cluster_hotplug_mask = NULL; update: this_cpu_write(cluster_masks, cmsk); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index e2829bf40e4a..b5cf9e7b3830 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -101,7 +101,7 @@ static int x2apic_phys_probe(void) } /* Common x2apic functions, also used by x2apic_cluster */ -int x2apic_apic_id_valid(int apicid) +int x2apic_apic_id_valid(u32 apicid) { return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f11910b44638..efaf2d4f9c3c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -557,7 +557,7 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static int uv_apic_id_valid(int apicid) +static int uv_apic_id_valid(u32 apicid) { return 1; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index dfcbe6924eaf..cadeafabf167 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v) return 0; } -static int proc_apm_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_apm_show, NULL); -} - -static const struct file_operations apm_file_ops = { - .owner = THIS_MODULE, - .open = proc_apm_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int apm(void *unused) { unsigned short bx; @@ -2360,7 +2347,7 @@ static int __init apm_init(void) set_desc_base(&gdt[APM_DS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); - proc_create("apm", 0, NULL, &apm_file_ops); + proc_create_single("apm", 0, NULL, proc_apm_show); kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12bc0a1139da..1b18be3f35a8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -10,6 +10,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/pci-direct.h> #include <asm/delay.h> @@ -554,6 +555,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_FAM10H_NODE_ID, value); nodes_per_socket = ((value >> 3) & 7) + 1; } + + if (c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { + case 0x15: bit = 54; break; + case 0x16: bit = 33; break; + case 0x17: bit = 10; break; + default: return; + } + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; + } + } } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -791,6 +812,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static void init_amd_zn(struct cpuinfo_x86 *c) { + set_cpu_cap(c, X86_FEATURE_ZEN); /* * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects * all up to and including B1. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bfca937bdcc3..7416fc206b4a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -12,8 +12,10 @@ #include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> +#include <linux/nospec.h> +#include <linux/prctl.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> @@ -27,6 +29,27 @@ #include <asm/intel-family.h> static void __init spectre_v2_select_mitigation(void); +static void __init ssb_select_mitigation(void); + +/* + * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any + * writes to SPEC_CTRL contain whatever reserved bits have been set. + */ +u64 __ro_after_init x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* + * The vendor and possibly platform specific bits which can be modified in + * x86_spec_ctrl_base. + */ +static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; + +/* + * AMD specific MSR info for Speculative Store Bypass control. + * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 __ro_after_init x86_amd_ls_cfg_base; +u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; void __init check_bugs(void) { @@ -37,9 +60,27 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* Allow STIBP in MSR_SPEC_CTRL if supported */ + if (boot_cpu_has(X86_FEATURE_STIBP)) + x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; + /* Select the proper spectre mitigation before patching alternatives */ spectre_v2_select_mitigation(); + /* + * Select proper mitigation for any exposure to the Speculative Store + * Bypass vulnerability. + */ + ssb_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -93,7 +134,76 @@ static const char *spectre_v2_strings[] = { #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = + SPECTRE_V2_NONE; + +void +x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +{ + u64 msrval, guestval, hostval = x86_spec_ctrl_base; + struct thread_info *ti = current_thread_info(); + + /* Is MSR_SPEC_CTRL implemented ? */ + if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { + /* + * Restrict guest_spec_ctrl to supported values. Clear the + * modifiable bits in the host base value and or the + * modifiable bits from the guest value. + */ + guestval = hostval & ~x86_spec_ctrl_mask; + guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; + + /* SSBD controlled in MSR_SPEC_CTRL */ + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); + } + } + + /* + * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update + * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. + */ + if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !static_cpu_has(X86_FEATURE_VIRT_SSBD)) + return; + + /* + * If the host has SSBD mitigation enabled, force it in the host's + * virtual MSR value. If its not permanently enabled, evaluate + * current's TIF_SSBD thread flag. + */ + if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) + hostval = SPEC_CTRL_SSBD; + else + hostval = ssbd_tif_to_spec_ctrl(ti->flags); + + /* Sanitize the guest value */ + guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + + if (hostval != guestval) { + unsigned long tif; + + tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + + speculative_store_bypass_update(tif); + } +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ + u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + wrmsrl(MSR_AMD64_LS_CFG, msrval); +} #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -312,32 +422,289 @@ retpoline_auto: } #undef pr_fmt +#define pr_fmt(fmt) "Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_NONE, + SPEC_STORE_BYPASS_CMD_AUTO, + SPEC_STORE_BYPASS_CMD_ON, + SPEC_STORE_BYPASS_CMD_PRCTL, + SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char *ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", + [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ + { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ + { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ + enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", + arg, sizeof(arg)); + if (ret < 0) + return SPEC_STORE_BYPASS_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { + if (!match_option(arg, ret, ssb_mitigation_options[i].option)) + continue; + + cmd = ssb_mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(ssb_mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPEC_STORE_BYPASS_CMD_AUTO; + } + } + + return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ + enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; + enum ssb_mitigation_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_SSBD)) + return mode; + + cmd = ssb_parse_cmdline(); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && + (cmd == SPEC_STORE_BYPASS_CMD_NONE || + cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + return mode; + + switch (cmd) { + case SPEC_STORE_BYPASS_CMD_AUTO: + case SPEC_STORE_BYPASS_CMD_SECCOMP: + /* + * Choose prctl+seccomp as the default mode if seccomp is + * enabled. + */ + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPEC_STORE_BYPASS_SECCOMP; + else + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_ON: + mode = SPEC_STORE_BYPASS_DISABLE; + break; + case SPEC_STORE_BYPASS_CMD_PRCTL: + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_NONE: + break; + } + + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. + * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass + * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation + */ + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses + * a completely different MSR and bit dependent on family. + */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + break; + case X86_VENDOR_AMD: + x86_amd_ssb_disable(); + break; + } + } + + return mode; +} + +static void ssb_select_mitigation(void) +{ + ssb_mode = __ssb_select_mitigation(); + + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculation prctl: " fmt + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + bool update; + + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; + + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + default: + return -ERANGE; + } + + /* + * If being set on non-current task, delay setting the CPU + * mitigation until it is next scheduled. + */ + if (task == current && update) + speculative_store_bypass_update_current(); + + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int ssb_prctl_get(struct task_struct *task) +{ + switch (ssb_mode) { + case SPEC_STORE_BYPASS_DISABLE: + return PR_SPEC_DISABLE; + case SPEC_STORE_BYPASS_SECCOMP: + case SPEC_STORE_BYPASS_PRCTL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + default: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; + } +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + default: + return -ENODEV; + } +} + +void x86_spec_ctrl_setup_ap(void) +{ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +} #ifdef CONFIG_SYSFS -ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) { - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); - if (boot_cpu_has(X86_FEATURE_PTI)) - return sprintf(buf, "Mitigation: PTI\n"); + + switch (bug) { + case X86_BUG_CPU_MELTDOWN: + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + + break; + + case X86_BUG_SPECTRE_V1: + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + + case X86_BUG_SPECTRE_V2: + return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + spectre_v2_module_string()); + + case X86_BUG_SPEC_STORE_BYPASS: + return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + + default: + break; + } + return sprintf(buf, "Vulnerable\n"); } +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); +} + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) - return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - return sprintf(buf, "Not affected\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - spectre_v2_module_string()); +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4702fbd98f92..38276f58d3bf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -757,17 +757,32 @@ static void init_speculation_control(struct cpuinfo_x86 *c) * and they also have a different bit for STIBP support. Also, * a hypervisor might have set the individual AMD bits even on * Intel CPUs, for finer-grained selection of what's available. - * - * We use the AMD bits in 0x8000_0008 EBX as the generic hardware - * features, which are visible in /proc/cpuinfo and used by the - * kernel. So set those accordingly from the Intel bits. */ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { set_cpu_cap(c, X86_FEATURE_IBRS); set_cpu_cap(c, X86_FEATURE_IBPB); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) set_cpu_cap(c, X86_FEATURE_STIBP); + + if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || + cpu_has(c, X86_FEATURE_VIRT_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_IBPB)) + set_cpu_cap(c, X86_FEATURE_IBPB); + + if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } } void get_cpu_cap(struct cpuinfo_x86 *c) @@ -850,15 +865,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; c->x86_capability[CPUID_8000_0008_EBX] = ebx; } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -874,6 +882,22 @@ void get_cpu_cap(struct cpuinfo_x86 *c) apply_forced_caps(c); } +static void get_cpu_address_sizes(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + if (c->extended_cpuid_level >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif +} + static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -918,21 +942,47 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { {} }; -static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +/* Only list CPUs which speculate but are non susceptible to SSB */ +static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + { X86_VENDOR_AMD, 0x12, }, + { X86_VENDOR_AMD, 0x11, }, + { X86_VENDOR_AMD, 0x10, }, + { X86_VENDOR_AMD, 0xf, }, + {} +}; + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; - if (x86_match_cpu(cpu_no_meltdown)) - return false; + if (x86_match_cpu(cpu_no_speculation)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + if (!x86_match_cpu(cpu_no_spec_store_bypass) && + !(ia32_cap & ARCH_CAP_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + if (x86_match_cpu(cpu_no_meltdown)) + return; + /* Rogue Data Cache Load? No! */ if (ia32_cap & ARCH_CAP_RDCL_NO) - return false; + return; - return true; + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); } /* @@ -965,6 +1015,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); + get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); if (this_cpu->c_early_init) @@ -982,12 +1033,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - if (!x86_match_cpu(cpu_no_speculation)) { - if (cpu_vulnerable_to_meltdown(c)) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - } + cpu_set_bug_bits(c); fpu__init_system(c); @@ -1097,6 +1143,8 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_cap(c); + get_cpu_address_sizes(c); + if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 @@ -1347,6 +1395,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #endif mtrr_ap_init(); validate_apic_and_package_id(c); + x86_spec_ctrl_setup_ap(); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e806b11a99af..37672d299e35 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -50,4 +50,6 @@ extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); +extern void x86_spec_ctrl_setup_ap(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 904b0a3c4e53..2c0bd38a44ab 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -19,7 +19,7 @@ struct cpuid_dep { * called from cpu hotplug. It shouldn't do anything in this case, * but it's difficult to tell that to the init reference checker. */ -const static struct cpuid_dep cpuid_deps[] = { +static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b9693b80fc21..577e7f7ae273 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -188,7 +188,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_IBPB); setup_clear_cpu_cap(X86_FEATURE_STIBP); setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL); setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SSBD); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD); } /* @@ -835,6 +838,9 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, + { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, + { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f7666eef4a87..c8e038800591 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -94,6 +94,11 @@ static struct smca_bank_name smca_names[] = { [SMCA_SMU] = { "smu", "System Management Unit" }, }; +static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = +{ + [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } +}; + const char *smca_get_name(enum smca_bank_types t) { if (t >= N_SMCA_BANK_TYPES) @@ -443,20 +448,26 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, if (!block) return MSR_AMD64_SMCA_MCx_MISC(bank); + /* Check our cache first: */ + if (smca_bank_addrs[bank][block] != -1) + return smca_bank_addrs[bank][block]; + /* * For SMCA enabled processors, BLKPTR field of the first MISC register * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). */ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return addr; + goto out; if (!(low & MCI_CONFIG_MCAX)) - return addr; + goto out; if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); +out: + smca_bank_addrs[bank][block] = addr; return addr; } @@ -468,18 +479,6 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) return addr; - /* Get address from already initialized block. */ - if (per_cpu(threshold_banks, cpu)) { - struct threshold_bank *bankp = per_cpu(threshold_banks, cpu)[bank]; - - if (bankp && bankp->blocks) { - struct threshold_block *blockp = &bankp->blocks[block]; - - if (blockp) - return blockp->address; - } - } - if (mce_flags.smca) return smca_get_block_address(cpu, bank, block); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 10c4fc2c91f8..77e201301528 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -564,14 +564,12 @@ static int __reload_late(void *info) apply_microcode_local(&err); spin_unlock(&update_lock); + /* siblings return UCODE_OK because their engine got updated already */ if (err > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); - return -1; - /* siblings return UCODE_OK because their engine got updated already */ + ret = -1; } else if (err == UCODE_UPDATED || err == UCODE_OK) { ret = 1; - } else { - return ret; } /* diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 32b8e5724f96..1c2cfa0644aa 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -485,7 +485,6 @@ static void show_saved_mc(void) */ static void save_mc_for_early(u8 *mc, unsigned int size) { -#ifdef CONFIG_HOTPLUG_CPU /* Synchronization during CPU hotplug. */ static DEFINE_MUTEX(x86_cpu_microcode_mutex); @@ -495,7 +494,6 @@ static void save_mc_for_early(u8 *mc, unsigned int size) show_saved_mc(); mutex_unlock(&x86_cpu_microcode_mutex); -#endif } static bool load_builtin_intel_microcode(struct cpio_data *cp) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 1f6680427ff0..f631a3f15587 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,37 +38,6 @@ #include <asm/virtext.h> #include <asm/intel_pt.h> -/* Alignment required for elf header segment */ -#define ELF_CORE_HEADER_ALIGN 4096 - -/* This primarily represents number of split ranges due to exclusion */ -#define CRASH_MAX_RANGES 16 - -struct crash_mem_range { - u64 start, end; -}; - -struct crash_mem { - unsigned int nr_ranges; - struct crash_mem_range ranges[CRASH_MAX_RANGES]; -}; - -/* Misc data about ram ranges needed to prepare elf headers */ -struct crash_elf_data { - struct kimage *image; - /* - * Total number of ram ranges we have after various adjustments for - * crash reserved region, etc. - */ - unsigned int max_nr_ranges; - - /* Pointer to elf header */ - void *ehdr; - /* Pointer to next phdr */ - void *bufp; - struct crash_mem mem; -}; - /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { struct boot_params *params; @@ -218,124 +187,49 @@ static int get_nr_ram_ranges_callback(struct resource *res, void *arg) return 0; } - /* Gather all the required information to prepare elf headers for ram regions */ -static void fill_up_crash_elf_data(struct crash_elf_data *ced, - struct kimage *image) +static struct crash_mem *fill_up_crash_elf_data(void) { unsigned int nr_ranges = 0; - - ced->image = image; + struct crash_mem *cmem; walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + if (!nr_ranges) + return NULL; - ced->max_nr_ranges = nr_ranges; - - /* Exclusion of crash region could split memory ranges */ - ced->max_nr_ranges++; - - /* If crashk_low_res is not 0, another range split possible */ - if (crashk_low_res.end) - ced->max_nr_ranges++; -} - -static int exclude_mem_range(struct crash_mem *mem, - unsigned long long mstart, unsigned long long mend) -{ - int i, j; - unsigned long long start, end; - struct crash_mem_range temp_range = {0, 0}; - - for (i = 0; i < mem->nr_ranges; i++) { - start = mem->ranges[i].start; - end = mem->ranges[i].end; - - if (mstart > end || mend < start) - continue; - - /* Truncate any area outside of range */ - if (mstart < start) - mstart = start; - if (mend > end) - mend = end; - - /* Found completely overlapping range */ - if (mstart == start && mend == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - } - mem->nr_ranges--; - return 0; - } - - if (mstart > start && mend < end) { - /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; - temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = mstart - 1; - else - mem->ranges[i].start = mend + 1; - break; - } + /* + * Exclusion of crash region and/or crashk_low_res may cause + * another range split. So add extra two slots here. + */ + nr_ranges += 2; + cmem = vzalloc(sizeof(struct crash_mem) + + sizeof(struct crash_mem_range) * nr_ranges); + if (!cmem) + return NULL; - /* If a split happend, add the split to array */ - if (!temp_range.end) - return 0; + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; - /* Split happened */ - if (i == CRASH_MAX_RANGES - 1) { - pr_err("Too many crash ranges after split\n"); - return -ENOMEM; - } - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; - } - - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; - return 0; + return cmem; } /* * Look for any unwanted ranges between mstart, mend and remove them. This - * might lead to split and split ranges are put in ced->mem.ranges[] array + * might lead to split and split ranges are put in cmem->ranges[] array */ -static int elf_header_exclude_ranges(struct crash_elf_data *ced, - unsigned long long mstart, unsigned long long mend) +static int elf_header_exclude_ranges(struct crash_mem *cmem) { - struct crash_mem *cmem = &ced->mem; int ret = 0; - memset(cmem->ranges, 0, sizeof(cmem->ranges)); - - cmem->ranges[0].start = mstart; - cmem->ranges[0].end = mend; - cmem->nr_ranges = 1; - /* Exclude crashkernel region */ - ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; if (crashk_low_res.end) { - ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, + crashk_low_res.end); if (ret) return ret; } @@ -345,144 +239,12 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { - struct crash_elf_data *ced = arg; - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - unsigned long mstart, mend; - struct kimage *image = ced->image; - struct crash_mem *cmem; - int ret, i; - - ehdr = ced->ehdr; - - /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, res->start, res->end); - if (ret) - return ret; - - /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ - cmem = &ced->mem; - - for (i = 0; i < cmem->nr_ranges; i++) { - mstart = cmem->ranges[i].start; - mend = cmem->ranges[i].end; - - phdr = ced->bufp; - ced->bufp += sizeof(Elf64_Phdr); - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = mstart; - - /* - * If a range matches backup region, adjust offset to backup - * segment. - */ - if (mstart == image->arch.backup_src_start && - (mend - mstart + 1) == image->arch.backup_src_sz) - phdr->p_offset = image->arch.backup_load_addr; - - phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); - phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; - phdr->p_align = 0; - ehdr->e_phnum++; - pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); - } - - return ret; -} - -static int prepare_elf64_headers(struct crash_elf_data *ced, - void **addr, unsigned long *sz) -{ - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; - unsigned char *buf, *bufp; - unsigned int cpu; - unsigned long long notes_addr; - int ret; + struct crash_mem *cmem = arg; - /* extra phdr for vmcoreinfo elf note */ - nr_phdr = nr_cpus + 1; - nr_phdr += ced->max_nr_ranges; - - /* - * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping - * area on x86_64 (ffffffff80000000 - ffffffffa0000000). - * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel - * text virtual addresses and other will have __va(physical) addresses. - */ + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; - nr_phdr++; - elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); - elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); - - buf = vzalloc(elf_sz); - if (!buf) - return -ENOMEM; - - bufp = buf; - ehdr = (Elf64_Ehdr *)bufp; - bufp += sizeof(Elf64_Ehdr); - memcpy(ehdr->e_ident, ELFMAG, SELFMAG); - ehdr->e_ident[EI_CLASS] = ELFCLASS64; - ehdr->e_ident[EI_DATA] = ELFDATA2LSB; - ehdr->e_ident[EI_VERSION] = EV_CURRENT; - ehdr->e_ident[EI_OSABI] = ELF_OSABI; - memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); - ehdr->e_type = ET_CORE; - ehdr->e_machine = ELF_ARCH; - ehdr->e_version = EV_CURRENT; - ehdr->e_phoff = sizeof(Elf64_Ehdr); - ehdr->e_ehsize = sizeof(Elf64_Ehdr); - ehdr->e_phentsize = sizeof(Elf64_Phdr); - - /* Prepare one phdr of type PT_NOTE for each present cpu */ - for_each_present_cpu(cpu) { - phdr = (Elf64_Phdr *)bufp; - bufp += sizeof(Elf64_Phdr); - phdr->p_type = PT_NOTE; - notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); - phdr->p_offset = phdr->p_paddr = notes_addr; - phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); - (ehdr->e_phnum)++; - } - - /* Prepare one PT_NOTE header for vmcoreinfo */ - phdr = (Elf64_Phdr *)bufp; - bufp += sizeof(Elf64_Phdr); - phdr->p_type = PT_NOTE; - phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; - (ehdr->e_phnum)++; - -#ifdef CONFIG_X86_64 - /* Prepare PT_LOAD type program header for kernel text region */ - phdr = (Elf64_Phdr *)bufp; - bufp += sizeof(Elf64_Phdr); - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; - phdr->p_filesz = phdr->p_memsz = _end - _text; - phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); - (ehdr->e_phnum)++; -#endif - - /* Prepare PT_LOAD headers for system ram chunks. */ - ced->ehdr = ehdr; - ced->bufp = bufp; - ret = walk_system_ram_res(0, -1, ced, - prepare_elf64_ram_headers_callback); - if (ret < 0) - return ret; - - *addr = buf; - *sz = elf_sz; return 0; } @@ -490,18 +252,46 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) { - struct crash_elf_data *ced; - int ret; + struct crash_mem *cmem; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + int ret, i; - ced = kzalloc(sizeof(*ced), GFP_KERNEL); - if (!ced) + cmem = fill_up_crash_elf_data(); + if (!cmem) return -ENOMEM; - fill_up_crash_elf_data(ced, image); + ret = walk_system_ram_res(0, -1, cmem, + prepare_elf64_ram_headers_callback); + if (ret) + goto out; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(cmem); + if (ret) + goto out; /* By default prepare 64bit headers */ - ret = prepare_elf64_headers(ced, addr, sz); - kfree(ced); + ret = crash_prepare_elf64_headers(cmem, + IS_ENABLED(CONFIG_X86_64), addr, sz); + if (ret) + goto out; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + ehdr = (Elf64_Ehdr *)*addr; + phdr = (Elf64_Phdr *)(ehdr + 1); + for (i = 0; i < ehdr->e_phnum; phdr++, i++) + if (phdr->p_type == PT_LOAD && + phdr->p_paddr == image->arch.backup_src_start && + phdr->p_memsz == image->arch.backup_src_sz) { + phdr->p_offset = image->arch.backup_load_addr; + break; + } +out: + vfree(cmem); return ret; } @@ -547,14 +337,14 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, /* Exclude Backup region */ start = image->arch.backup_load_addr; end = start + image->arch.backup_src_sz - 1; - ret = exclude_mem_range(cmem, start, end); + ret = crash_exclude_mem_range(cmem, start, end); if (ret) return ret; /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; - return exclude_mem_range(cmem, start, end); + return crash_exclude_mem_range(cmem, start, end); } /* Prepare memory map for crash dump kernel */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index e5ec3cafa72e..aebd0d5bc086 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,6 +195,10 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); + /* + * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since + * this is mapped to userspace. + */ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0c855deee165..2d29e47c056e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -104,6 +104,12 @@ static bool __head check_la57_support(unsigned long physaddr) } #endif +/* Code in __startup_64() can be relocated during execution, but the compiler + * doesn't have to generate PC-relative relocations when accessing globals from + * that function. Clang actually does not generate them, which leads to + * boot-time crashes. To work around this problem, every global pointer must + * be adjusted using fixup_pointer(). + */ unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { @@ -113,6 +119,7 @@ unsigned long __head __startup_64(unsigned long physaddr, p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; + pteval_t *mask_ptr; bool la57; int i; unsigned int *next_pgt_ptr; @@ -195,6 +202,9 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); + pmd_entry &= *mask_ptr; pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 48385c1074a5..8344dd2f310a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -399,8 +399,13 @@ NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. + /* + * Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. + * + * Note: This sets _PAGE_GLOBAL despite whether + * the CPU supports it or it is enabled. But, + * the CPU should ignore the bit. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #else @@ -431,6 +436,10 @@ NEXT_PAGE(level2_kernel_pgt) * (NOTE: at +512MB starts the module area, see MODULES_VADDR. * If you want to increase this then increase MODULES_VADDR * too.) + * + * This table is eventually used by the kernel during normal + * runtime. Care must be taken to clear out undesired bits + * later, like _PAGE_RW or _PAGE_GLOBAL in some cases. */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index fa183a131edc..a15fe0e92cf9 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL2.0 +// SPDX-License-Identifier: GPL-2.0 /* * Jailhouse paravirt_ops implementation * diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index fb095ba0c02f..7326078eaa7a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -334,7 +334,6 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; - unsigned long purgatory_load_addr; struct bzimage64_data *ldata; struct kexec_entry64_regs regs64; void *stack; @@ -342,6 +341,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, .top_down = true }; + struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR, + .buf_max = ULONG_MAX, .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -379,14 +380,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel, * Load purgatory. For 64bit entry point, purgatory code can be * anywhere. */ - ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, - &purgatory_load_addr); + ret = kexec_load_purgatory(image, &pbuf); if (ret) { pr_err("Loading purgatory failed\n"); return ERR_PTR(ret); } - pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); /* @@ -398,11 +398,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, * little bit simple */ efi_map_sz = efi_get_runtime_map_size(); - efi_map_sz = ALIGN(efi_map_sz, 16); params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - kbuf.bufsz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + sizeof(struct efi_setup_data); @@ -410,7 +409,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; - efi_setup_data_offset = efi_map_offset + efi_map_sz; + efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; @@ -538,7 +537,7 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) } #endif -struct kexec_file_ops kexec_bzImage64_ops = { +const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0715f827607c..6f4d42377fe5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -370,6 +370,10 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return 0; + #ifdef CONFIG_X86_64 /* Only x86_64 has RIP relative instructions */ if (insn_rip_relative(insn)) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7867417cfaff..5b2300b818af 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -457,7 +457,7 @@ static void __init sev_map_percpu_data(void) static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) static_branch_disable(&virt_spin_lock_key); } @@ -553,7 +553,7 @@ static void __init kvm_guest_init(void) } if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_DEDICATED) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; @@ -649,7 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void) int cpu; if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_DEDICATED) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), @@ -745,7 +745,7 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; - if (kvm_para_has_hint(KVM_HINTS_DEDICATED)) + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; __pv_init_lock_hash(); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 26d713ecad34..c9b14020f4dd 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -145,6 +145,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) unsigned long offset = i << PAGE_SHIFT; const void *src = (char *)ldt->entries + offset; unsigned long pfn; + pgprot_t pte_prot; pte_t pte, *ptep; va = (unsigned long)ldt_slot_va(slot) + offset; @@ -163,7 +164,10 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) * target via some kernel interface which misses a * permission check. */ - pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); + /* Filter out unsuppored __PAGE_KERNEL* bits: */ + pgprot_val(pte_prot) &= __supported_pte_mask; + pte = pfn_pte(pfn, pte_prot); set_pte_at(mm, va, ptep, pte); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 60cdec6628b0..d1ab07ec8c9a 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -57,12 +57,17 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { free_page((unsigned long)image->arch.pgd); + image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); + image->arch.pmd0 = NULL; free_page((unsigned long)image->arch.pmd1); + image->arch.pmd1 = NULL; #endif free_page((unsigned long)image->arch.pte0); + image->arch.pte0 = NULL; free_page((unsigned long)image->arch.pte1); + image->arch.pte1 = NULL; } static int machine_kexec_alloc_page_tables(struct kimage *image) @@ -79,7 +84,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) !image->arch.pmd0 || !image->arch.pmd1 || #endif !image->arch.pte0 || !image->arch.pte1) { - machine_kexec_free_page_tables(image); return -ENOMEM; } return 0; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 93bd4fb603d1..6010449ca6d2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -30,17 +30,22 @@ #include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE -static struct kexec_file_ops *kexec_file_loaders[] = { +const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, + NULL }; #endif static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.p4d); + image->arch.p4d = NULL; free_page((unsigned long)image->arch.pud); + image->arch.pud = NULL; free_page((unsigned long)image->arch.pmd); + image->arch.pmd = NULL; free_page((unsigned long)image->arch.pte); + image->arch.pte = NULL; } static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) @@ -90,7 +95,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: - free_transition_pgtable(image); return result; } @@ -364,27 +368,6 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - int i, ret = -ENOEXEC; - struct kexec_file_ops *fops; - - for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { - fops = kexec_file_loaders[i]; - if (!fops || !fops->probe) - continue; - - ret = fops->probe(buf, buf_len); - if (!ret) { - image->fops = fops; - return ret; - } - } - - return ret; -} - void *arch_kexec_kernel_image_load(struct kimage *image) { vfree(image->arch.elf_headers); @@ -399,88 +382,53 @@ void *arch_kexec_kernel_image_load(struct kimage *image) image->cmdline_buf_len); } -int arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - if (!image->fops || !image->fops->cleanup) - return 0; - - return image->fops->cleanup(image->image_loader_data); -} - -#ifdef CONFIG_KEXEC_VERIFY_SIG -int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, - unsigned long kernel_len) -{ - if (!image->fops || !image->fops->verify_sig) { - pr_debug("kernel loader does not support signature verification."); - return -EKEYREJECTED; - } - - return image->fops->verify_sig(kernel, kernel_len); -} -#endif - /* * Apply purgatory relocations. * - * ehdr: Pointer to elf headers - * sechdrs: Pointer to section headers. - * relsec: section index of SHT_RELA section. + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtabsec: Corresponding symtab. * * TODO: Some of the code belongs to generic code. Move that in kexec.c. */ -int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, - Elf64_Shdr *sechdrs, unsigned int relsec) +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, const Elf_Shdr *relsec, + const Elf_Shdr *symtabsec) { unsigned int i; Elf64_Rela *rel; Elf64_Sym *sym; void *location; - Elf64_Shdr *section, *symtabsec; unsigned long address, sec_base, value; const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; - /* - * ->sh_offset has been modified to keep the pointer to section - * contents in memory - */ - rel = (void *)sechdrs[relsec].sh_offset; - - /* Section to which relocations apply */ - section = &sechdrs[sechdrs[relsec].sh_info]; - - pr_debug("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - - /* Associated symbol table */ - symtabsec = &sechdrs[sechdrs[relsec].sh_link]; - - /* String table */ - if (symtabsec->sh_link >= ehdr->e_shnum) { - /* Invalid strtab section number */ - pr_err("Invalid string table section index %d\n", - symtabsec->sh_link); - return -ENOEXEC; - } + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; - strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + rel = (void *)pi->ehdr + relsec->sh_offset; - /* section header string table */ - shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + pr_debug("Applying relocate section %s to %u\n", + shstrtab + relsec->sh_name, relsec->sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { /* * rel[i].r_offset contains byte offset from beginning * of section to the storage unit affected. * - * This is location to update (->sh_offset). This is temporary - * buffer where section is currently loaded. This will finally - * be loaded to a different address later, pointed to by + * This is location to update. This is temporary buffer + * where section is currently loaded. This will finally be + * loaded to a different address later, pointed to by * ->sh_addr. kexec takes care of moving it * (kexec_load_segment()). */ - location = (void *)(section->sh_offset + rel[i].r_offset); + location = pi->purgatory_buf; + location += section->sh_offset; + location += rel[i].r_offset; /* Final address of the location */ address = section->sh_addr + rel[i].r_offset; @@ -491,8 +439,8 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get * these respectively. */ - sym = (Elf64_Sym *)symtabsec->sh_offset + - ELF64_R_SYM(rel[i].r_info); + sym = (void *)pi->ehdr + symtabsec->sh_offset; + sym += ELF64_R_SYM(rel[i].r_info); if (sym->st_name) name = strtab + sym->st_name; @@ -515,12 +463,12 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, if (sym->st_shndx == SHN_ABS) sec_base = 0; - else if (sym->st_shndx >= ehdr->e_shnum) { + else if (sym->st_shndx >= pi->ehdr->e_shnum) { pr_err("Invalid section %d for symbol %s\n", sym->st_shndx, name); return -ENOEXEC; } else - sec_base = sechdrs[sym->st_shndx].sh_addr; + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; value = sym->st_value; value += sec_base; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 77625b60a510..ab5d9dd668d2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -15,13 +15,11 @@ #include <asm/x86_init.h> #include <asm/iommu_table.h> -static int forbid_dac __read_mostly; +static bool disable_dac_quirk __read_mostly; const struct dma_map_ops *dma_ops = &dma_direct_ops; EXPORT_SYMBOL(dma_ops); -static int iommu_sac_force __read_mostly; - #ifdef CONFIG_IOMMU_DEBUG int panic_on_overflow __read_mostly = 1; int force_iommu __read_mostly = 1; @@ -55,9 +53,6 @@ struct device x86_dma_fallback_dev = { }; EXPORT_SYMBOL(x86_dma_fallback_dev); -/* Number of entries preallocated for DMA-API debugging */ -#define PREALLOC_DMA_DEBUG_ENTRIES 65536 - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -76,7 +71,7 @@ void __init pci_iommu_alloc(void) } } -bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) +bool arch_dma_alloc_attrs(struct device **dev) { if (!*dev) *dev = &x86_dma_fallback_dev; @@ -125,13 +120,13 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "nomerge", 7)) iommu_merge = 0; if (!strncmp(p, "forcesac", 8)) - iommu_sac_force = 1; + pr_warn("forcesac option ignored.\n"); if (!strncmp(p, "allowdac", 8)) - forbid_dac = 0; + pr_warn("allowdac option ignored.\n"); if (!strncmp(p, "nodac", 5)) - forbid_dac = 1; + pr_warn("nodac option ignored.\n"); if (!strncmp(p, "usedac", 6)) { - forbid_dac = -1; + disable_dac_quirk = true; return 1; } #ifdef CONFIG_SWIOTLB @@ -156,40 +151,9 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int arch_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI - if (mask > 0xffffffff && forbid_dac > 0) { - dev_info(dev, "PCI: Disallowing DAC for device\n"); - return 0; - } -#endif - - /* Tell the device to use SAC when IOMMU force is on. This - allows the driver to use cheaper accesses in some cases. - - Problem with this is that if we overflow the IOMMU area and - return DAC as fallback address the device may not handle it - correctly. - - As a special case some controllers have a 39bit address - mode that is as efficient as 32bit (aic79xx). Don't force - SAC for these. Assume all masks <= 40 bits are of this - type. Normally this doesn't make any difference, but gives - more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { - dev_info(dev, "Force SAC with mask %Lx\n", mask); - return 0; - } - - return 1; -} -EXPORT_SYMBOL(arch_dma_supported); - static int __init pci_iommu_init(void) { struct iommu_table_entry *p; - dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); @@ -209,11 +173,17 @@ rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ +static int via_no_dac_cb(struct pci_dev *pdev, void *data) +{ + pdev->dev.dma_32bit_limit = true; + return 0; +} + static void via_no_dac(struct pci_dev *dev) { - if (forbid_dac == 0) { + if (!disable_dac_quirk) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); - forbid_dac = 1; + pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL); } } DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c deleted file mode 100644 index ac7ea3a8242f..000000000000 --- a/arch/x86/kernel/pci-nommu.c +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Fallback functions when the main IOMMU code is not compiled in. This - code is roughly equivalent to i386. */ -#include <linux/dma-direct.h> -#include <linux/scatterlist.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/pci.h> -#include <linux/mm.h> - -#include <asm/processor.h> -#include <asm/iommu.h> -#include <asm/dma.h> - -#define NOMMU_MAPPING_ERROR 0 - -static int -check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) -{ - if (hwdev && !dma_capable(hwdev, bus, size)) { - if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) - printk(KERN_ERR - "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", - name, (long long)bus, size, - (long long)*hwdev->dma_mask); - return 0; - } - return 1; -} - -static dma_addr_t nommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; - WARN_ON(size == 0); - if (!check_addr("map_single", dev, bus, size)) - return NOMMU_MAPPING_ERROR; - return bus; -} - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *s; - int i; - - WARN_ON(nents == 0 || sg[0].length == 0); - - for_each_sg(sg, s, nents, i) { - BUG_ON(!sg_page(s)); - s->dma_address = sg_phys(s); - if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) - return 0; - s->dma_length = s->length; - } - return nents; -} - -static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return dma_addr == NOMMU_MAPPING_ERROR; -} - -const struct dma_map_ops nommu_dma_ops = { - .alloc = dma_generic_alloc_coherent, - .free = dma_generic_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .is_phys = 1, - .mapping_error = nommu_mapping_error, - .dma_supported = x86_dma_supported, -}; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 03408b942adb..30ca2d1a9231 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -38,6 +38,7 @@ #include <asm/switch_to.h> #include <asm/desc.h> #include <asm/prctl.h> +#include <asm/spec-ctrl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -278,6 +279,148 @@ static inline void switch_to_bitmap(struct tss_struct *tss, } } +#ifdef CONFIG_SMP + +struct ssb_state { + struct ssb_state *shared_state; + raw_spinlock_t lock; + unsigned int disable_state; + unsigned long local_state; +}; + +#define LSTATE_SSB 0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + st->local_state = 0; + + /* + * Shared state setup happens once on the first bringup + * of the CPU. It's not destroyed on CPU hotunplug. + */ + if (st->shared_state) + return; + + raw_spin_lock_init(&st->lock); + + /* + * Go over HT siblings and check whether one of them has set up the + * shared state pointer already. + */ + for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (!per_cpu(ssb_state, cpu).shared_state) + continue; + + /* Link it to the state of the sibling: */ + st->shared_state = per_cpu(ssb_state, cpu).shared_state; + return; + } + + /* + * First HT sibling to come up on the core. Link shared state of + * the first HT sibling to itself. The siblings on the same core + * which come up later will see the shared state pointer and link + * themself to the state of this CPU. + */ + st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + u64 msr = x86_amd_ls_cfg_base; + + if (!static_cpu_has(X86_FEATURE_ZEN)) { + msr |= ssbd_tif_to_amd_ls_cfg(tifn); + wrmsrl(MSR_AMD64_LS_CFG, msr); + return; + } + + if (tifn & _TIF_SSBD) { + /* + * Since this can race with prctl(), block reentry on the + * same CPU. + */ + if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) + return; + + msr |= x86_amd_ls_cfg_ssbd_mask; + + raw_spin_lock(&st->shared_state->lock); + /* First sibling enables SSBD: */ + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + st->shared_state->disable_state++; + raw_spin_unlock(&st->shared_state->lock); + } else { + if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) + return; + + raw_spin_lock(&st->shared_state->lock); + st->shared_state->disable_state--; + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + raw_spin_unlock(&st->shared_state->lock); + } +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + + wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ + /* + * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, + * so ssbd_tif_to_spec_ctrl() just works. + */ + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +static __always_inline void intel_set_ssb_state(unsigned long tifn) +{ + u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); + + wrmsrl(MSR_IA32_SPEC_CTRL, msr); +} + +static __always_inline void __speculative_store_bypass_update(unsigned long tifn) +{ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) + amd_set_ssb_virt_state(tifn); + else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + amd_set_core_ssb_state(tifn); + else + intel_set_ssb_state(tifn); +} + +void speculative_store_bypass_update(unsigned long tif) +{ + preempt_disable(); + __speculative_store_bypass_update(tif); + preempt_enable(); +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { @@ -309,6 +452,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); + + if ((tifp ^ tifn) & _TIF_SSBD) + __speculative_store_bypass_update(tifn); } /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4b100fe0f508..12bb445fb98d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -542,6 +542,7 @@ void set_personality_64bit(void) clear_thread_flag(TIF_X32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; + current_thread_info()->status &= ~TS_COMPAT; /* Ensure the corresponding mm is not marked. */ if (current->mm) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6285697b6e56..5c623dfe39d1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -50,6 +50,7 @@ #include <linux/init_ohci1394_dma.h> #include <linux/kvm_para.h> #include <linux/dma-contiguous.h> +#include <xen/xen.h> #include <linux/errno.h> #include <linux/kernel.h> @@ -534,6 +535,11 @@ static void __init reserve_crashkernel(void) high = true; } + if (xen_pv_domain()) { + pr_info("Ignoring crashkernel for a Xen PV domain\n"); + return; + } + /* 0 means: find the address automatically */ if (crash_base <= 0) { /* diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index df92605d8724..14c057f29979 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -26,7 +26,7 @@ static inline void signal_compat_build_tests(void) * new fields are handled in copy_siginfo_to_user32()! */ BUILD_BUG_ON(NSIGILL != 11); - BUILD_BUG_ON(NSIGFPE != 14); + BUILD_BUG_ON(NSIGFPE != 15); BUILD_BUG_ON(NSIGSEGV != 7); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 4); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ff99e2b6fc54..9dd324ae4832 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -77,6 +77,9 @@ #include <asm/i8259.h> #include <asm/misc.h> #include <asm/qspinlock.h> +#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> +#include <asm/spec-ctrl.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -242,6 +245,8 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); + speculative_store_bypass_ht_init(); + /* * Lock vector_lock, set CPU online and bring the vector * allocator online. Online must be set with vector_lock held @@ -390,15 +395,47 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +/* + * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * + * These are Intel CPUs that enumerate an LLC that is shared by + * multiple NUMA nodes. The LLC on these systems is shared for + * off-package data access but private to the NUMA node (half + * of the package) for on-package access. + * + * CPUID (the source of the information about the LLC) can only + * enumerate the cache as being shared *or* unshared, but not + * this particular configuration. The CPU in this case enumerates + * the cache to be shared across the entire package (spanning both + * NUMA nodes). + */ + +static const struct x86_cpu_id snc_cpu[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, + {} +}; + static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && - per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) - return topology_sane(c, o, "llc"); + /* Do not match if we do not have a valid APICID for cpu: */ + if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) + return false; - return false; + /* Do not match if LLC id does not match: */ + if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2)) + return false; + + /* + * Allow the SNC topology without warning. Return of false + * means 'c' does not share the LLC of 'o'. This will be + * reflected to userspace. + */ + if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + return false; + + return topology_sane(c, o, "llc"); } /* @@ -456,7 +493,8 @@ static struct sched_domain_topology_level x86_topology[] = { /* * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours and Intel Cluster-on-Die have this. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. */ static bool x86_has_numa_in_package; @@ -1257,6 +1295,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_mtrr_aps_delayed_init(); smp_quirk_init_udelay(); + + speculative_store_bypass_ht_init(); } void arch_enable_nonboot_cpus_begin(void) @@ -1536,6 +1576,8 @@ static inline void mwait_play_dead(void) void *mwait_ptr; int i; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; if (!this_cpu_has(X86_FEATURE_MWAIT)) return; if (!this_cpu_has(X86_FEATURE_CLFLUSH)) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ef32297ff17e..74392d9d51e0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -317,7 +317,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); - do_div(deltatsc, tmp); + deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } @@ -1067,6 +1067,7 @@ static struct clocksource clocksource_tsc_early = { .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, + .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* @@ -1086,6 +1087,7 @@ static struct clocksource clocksource_tsc = { .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, + .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) @@ -1098,13 +1100,9 @@ void mark_tsc_unstable(char *reason) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) { - clocksource_mark_unstable(&clocksource_tsc); - } else { - clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; - clocksource_tsc.rating = 0; - } + + clocksource_mark_unstable(&clocksource_tsc_early); + clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); @@ -1244,7 +1242,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) - return; + goto unreg; /* * Since the work is started early in boot, we may be @@ -1297,11 +1295,12 @@ static void tsc_refine_calibration_work(struct work_struct *work) out: if (tsc_unstable) - return; + goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); +unreg: clocksource_unregister(&clocksource_tsc_early); } @@ -1311,8 +1310,8 @@ static int __init init_tsc_clocksource(void) if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; - if (check_tsc_unstable()) - return 0; + if (tsc_unstable) + goto unreg; if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; @@ -1328,6 +1327,7 @@ static int __init init_tsc_clocksource(void) if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); +unreg: clocksource_unregister(&clocksource_tsc_early); return 0; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 85c7ef23d99f..c84bb5396958 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -299,6 +299,10 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool if (is_prefix_bad(insn)) return -ENOTSUPP; + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return -ENOTSUPP; + if (x86_64) good_insns = good_insns_64; else diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b90a8b3..92bf2f2e7cdd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -379,7 +379,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(IBPB) | F(IBRS); + F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -408,7 +408,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(ARCH_CAPABILITIES); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -495,6 +495,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); } else { entry->ebx = 0; entry->ecx = 0; @@ -647,13 +652,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - /* IBRS and IBPB aren't necessarily present in hardware cpuid */ - if (boot_cpu_has(X86_FEATURE_IBPB)) - entry->ebx |= F(IBPB); - if (boot_cpu_has(X86_FEATURE_IBRS)) - entry->ebx |= F(IBRS); + /* + * IBRS, IBPB and VIRT_SSBD aren't necessarily present in + * hardware cpuid + */ + if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + entry->ebx |= F(AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + entry->ebx |= F(AMD_IBRS); + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + entry->ebx |= F(VIRT_SSBD); entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + entry->ebx |= F(VIRT_SSBD); break; } case 0x80000019: diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 98618e397342..46ff64da44ca 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1260,12 +1260,16 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) } } -static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { - struct kvm_run *run = vcpu->run; + kvm_hv_hypercall_set_result(vcpu, result); + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); +} - kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); - return 1; +static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) @@ -1296,8 +1300,10 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) if (param & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; - /* conn_to_evt is protected by vcpu->kvm->srcu */ + /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ + rcu_read_lock(); eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param); + rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; @@ -1348,7 +1354,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) /* Hypercall continuation is not supported yet */ if (rep_cnt || rep_idx) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; - goto set_result; + goto out; } switch (code) { @@ -1379,9 +1385,8 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } -set_result: - kvm_hv_hypercall_set_result(vcpu, ret); - return 1; +out: + return kvm_hv_hypercall_complete(vcpu, ret); } void kvm_hv_init_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 70dcb5548022..3773c4625114 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1463,23 +1463,6 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } -static void start_sw_period(struct kvm_lapic *apic) -{ - if (!apic->lapic_timer.period) - return; - - if (apic_lvtt_oneshot(apic) && - ktime_after(ktime_get(), - apic->lapic_timer.target_expiration)) { - apic_timer_expired(apic); - return; - } - - hrtimer_start(&apic->lapic_timer.timer, - apic->lapic_timer.target_expiration, - HRTIMER_MODE_ABS_PINNED); -} - static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; @@ -1539,11 +1522,43 @@ static bool set_target_expiration(struct kvm_lapic *apic) static void advance_periodic_target_expiration(struct kvm_lapic *apic) { - apic->lapic_timer.tscdeadline += - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + ktime_t now = ktime_get(); + u64 tscl = rdtsc(); + ktime_t delta; + + /* + * Synchronize both deadlines to the same time source or + * differences in the periods (caused by differences in the + * underlying clocks or numerical approximation errors) will + * cause the two to drift apart over time as the errors + * accumulate. + */ apic->lapic_timer.target_expiration = ktime_add_ns(apic->lapic_timer.target_expiration, apic->lapic_timer.period); + delta = ktime_sub(apic->lapic_timer.target_expiration, now); + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, delta); +} + +static void start_sw_period(struct kvm_lapic *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (ktime_after(ktime_get(), + apic->lapic_timer.target_expiration)) { + apic_timer_expired(apic); + + if (apic_lvtt_oneshot(apic)) + return; + + advance_periodic_target_expiration(apic); + } + + hrtimer_start(&apic->lapic_timer.timer, + apic->lapic_timer.target_expiration, + HRTIMER_MODE_ABS_PINNED); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b58787daf9f8..26110c202b19 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -49,7 +49,7 @@ #include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/irq_remapping.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/virtext.h> #include "trace.h" @@ -213,6 +213,12 @@ struct vcpu_svm { } host; u64 spec_ctrl; + /* + * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be + * translated into the appropriate L2_CFG bits on the host to + * perform speculative control. + */ + u64 virt_spec_ctrl; u32 *msrpm; @@ -1423,12 +1429,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_guest_mode(vcpu)) + return svm->nested.hsave->control.tsc_offset; + + return vcpu->arch.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); u64 g_tsc_offset = 0; if (is_guest_mode(vcpu)) { + /* Write L1's TSC offset. */ g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; @@ -2049,6 +2066,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.microcode_version = 0x01000065; svm->spec_ctrl = 0; + svm->virt_spec_ctrl = 0; if (!init_event) { svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | @@ -3322,6 +3340,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) /* Restore the original control entries */ copy_vmcb_control_area(vmcb, hsave); + svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); @@ -3482,10 +3501,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, /* We don't want to see VMMCALLs from a nested guest */ clr_intercept(svm, INTERCEPT_VMMCALL); + svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; - svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; @@ -4035,12 +4056,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct vcpu_svm *svm = to_svm(vcpu); switch (msr_info->index) { - case MSR_IA32_TSC: { - msr_info->data = svm->vmcb->control.tsc_offset + - kvm_scale_tsc(vcpu, rdtsc()); - - break; - } case MSR_STAR: msr_info->data = svm->vmcb->save.star; break; @@ -4100,11 +4115,18 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) return 1; msr_info->data = svm->spec_ctrl; break; + case MSR_AMD64_VIRT_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + return 1; + + msr_info->data = svm->virt_spec_ctrl; + break; case MSR_F15H_IC_CFG: { int family, model; @@ -4193,12 +4215,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.g_pat = data; mark_dirty(svm->vmcb, VMCB_NPT); break; - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr); - break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS)) + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ @@ -4225,7 +4244,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBPB)) + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) return 1; if (data & ~PRED_CMD_IBPB) @@ -4239,6 +4258,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; + case MSR_AMD64_VIRT_SPEC_CTRL: + if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + return 1; + + if (data & ~SPEC_CTRL_SSBD) + return 1; + + svm->virt_spec_ctrl = data; + break; case MSR_STAR: svm->vmcb->save.star = data; break; @@ -5265,9 +5294,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, } if (!ret && svm) { - trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, - host_irq, e->gsi, - vcpu_info.vector, + trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, + e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); } @@ -5553,8 +5581,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (svm->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); asm volatile ( "push %%" _ASM_BP "; \n\t" @@ -5648,6 +5675,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); +#else + loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif +#endif + /* * We do not use IBRS in the kernel. If this vCPU has used the * SPEC_CTRL MSR it may have left it on; save the value and @@ -5666,20 +5705,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (svm->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); reload_tss(vcpu); @@ -5782,7 +5808,7 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_high_real_mode_segbase(void) +static bool svm_has_emulated_msr(int index) { return true; } @@ -7008,7 +7034,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, - .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, + .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ -7102,6 +7128,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index aafcc9881e88..40aa29204baf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,7 +51,7 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> #include <asm/mmu_context.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/mshyperv.h> #include "trace.h" @@ -1494,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void) SECONDARY_EXEC_ENABLE_VMFUNC; } +static bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -2880,18 +2886,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -/* - * reads and returns guest's timestamp counter "register" - * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset - * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 - */ -static u64 guest_read_tsc(struct kvm_vcpu *vcpu) +static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) { - u64 host_tsc, tsc_offset; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - host_tsc = rdtsc(); - tsc_offset = vmcs_read64(TSC_OFFSET); - return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + return vcpu->arch.tsc_offset - vmcs12->tsc_offset; + + return vcpu->arch.tsc_offset; } /* @@ -3524,12 +3527,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); - case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(vcpu); - break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; @@ -3646,17 +3645,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); - break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) return 1; vmx->spec_ctrl = data; @@ -3682,7 +3677,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; @@ -4553,12 +4547,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } -static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) -{ - if (enable_ept) - vmx_flush_tlb(vcpu, true); -} - static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4776,14 +4764,16 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; - if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - hw_cr4 &= ~X86_CR4_UMIP; - } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (cr4 & X86_CR4_UMIP) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + } if (cr4 & X86_CR4_VMXE) { /* @@ -9287,7 +9277,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } else { sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); @@ -9315,7 +9305,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -9495,9 +9485,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); -static bool vmx_has_high_real_mode_segbase(void) +static bool vmx_has_emulated_msr(int index) { - return enable_unrestricted_guest || emulate_invalid_guest_state; + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_AMD64_VIRT_SPEC_CTRL: + /* This is AMD only. */ + return false; + default: + return true; + } } static bool vmx_mpx_supported(void) @@ -9512,12 +9514,6 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } -static bool vmx_umip_emulated(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -9735,8 +9731,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); vmx->__launched = vmx->loaded_vmcs->launched; @@ -9884,8 +9879,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); @@ -10608,6 +10602,16 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !page_address_valid(vcpu, vmcs12->apic_access_addr)) + return -EINVAL; + else + return 0; +} + static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -11176,11 +11180,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vmcs_write64(TSC_OFFSET, - vcpu->arch.tsc_offset + vmcs12->tsc_offset); - else - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -11222,7 +11223,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* @@ -11299,6 +11300,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -11420,6 +11424,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 msr_entry_idx; u32 exit_qual; + int r; enter_guest_mode(vcpu); @@ -11429,26 +11434,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); - if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { - leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, exit_qual); - return 1; - } + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset += vmcs12->tsc_offset; + + r = EXIT_REASON_INVALID_STATE; + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) + goto fail; nested_get_vmcs12_pages(vcpu, vmcs12); + r = EXIT_REASON_MSR_LOAD_FAIL; msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) { - leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); - return 1; - } + if (msr_entry_idx) + goto fail; /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11457,6 +11457,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 0; + +fail: + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + leave_guest_mode(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); + return 1; } /* @@ -12028,6 +12036,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (likely(!vmx->fail)) { if (exit_reason == -1) sync_vmcs12(vcpu, vmcs12); @@ -12065,7 +12076,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, } else if (!nested_cpu_has_ept(vmcs12) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb_ept_only(vcpu); + vmx_flush_tlb(vcpu, true); } /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -12224,10 +12235,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(); - u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); - u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + struct vcpu_vmx *vmx; + u64 tscl, guest_tscl, delta_tsc; + + if (kvm_mwait_in_guest(vcpu->kvm)) + return -EOPNOTSUPP; + + vmx = to_vmx(vcpu); + tscl = rdtsc(); + guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && @@ -12533,7 +12550,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; - trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); if (set) @@ -12622,7 +12639,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, - .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, + .has_emulated_msr = vmx_has_emulated_msr, .vm_init = vmx_vm_init, .vm_alloc = vmx_vm_alloc, @@ -12712,6 +12729,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b2ff74b12ec4..71e7cda6d014 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -114,7 +114,7 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); static bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); -unsigned int min_timer_period_us = 500; +unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; @@ -843,7 +843,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { #ifdef CONFIG_X86_64 - cr3 &= ~CR3_PCID_INVD; + bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + if (pcid_enabled) + cr3 &= ~CR3_PCID_INVD; #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { @@ -1058,6 +1061,7 @@ static u32 emulated_msrs[] = { MSR_SMI_COUNT, MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, }; static unsigned num_emulated_msrs; @@ -1490,7 +1494,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = vcpu->arch.tsc_offset; + u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1532,7 +1536,9 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + + return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -2362,6 +2368,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -2605,6 +2614,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_UCODE_REV: msr_info->data = vcpu->arch.microcode_version; break; + case MSR_IA32_TSC: + msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; + break; case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2819,7 +2831,8 @@ out: static inline bool kvm_can_mwait_in_guest(void) { return boot_cpu_has(X86_FEATURE_MWAIT) && - !boot_cpu_has_bug(X86_BUG_MONITOR); + !boot_cpu_has_bug(X86_BUG_MONITOR) && + boot_cpu_has(X86_FEATURE_ARAT); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -2894,7 +2907,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); @@ -4594,14 +4607,8 @@ static void kvm_init_msr_list(void) num_msrs_to_save = j; for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - switch (emulated_msrs[i]) { - case MSR_IA32_SMBASE: - if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) - continue; - break; - default: - break; - } + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + continue; if (j < i) emulated_msrs[j] = emulated_msrs[i]; @@ -6662,9 +6669,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r; - - r = kvm_skip_emulated_instruction(vcpu); + int op_64_bit; if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -6712,8 +6717,9 @@ out: if (!op_64_bit) ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + ++vcpu->stat.hypercalls; - return r; + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -7970,6 +7976,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct msr_data apic_base_msr; int mmu_reset_needed = 0; + int cpuid_update_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; int ret = -EINVAL; @@ -8008,8 +8015,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & + (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + if (cpuid_update_needed) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7d35ce672989..c9492f764902 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -302,13 +302,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) -#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) -#define KVM_X86_DISABLE_EXITS_HTL (1 << 1) -#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) -#define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ - KVM_X86_DISABLE_EXITS_HTL | \ - KVM_X86_DISABLE_EXITS_PAUSE) - static inline bool kvm_mwait_in_guest(struct kvm *kvm) { return kvm->arch.mwait_in_guest; diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 476d810639a8..b45f5aaefd74 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -27,8 +27,20 @@ EXPORT_SYMBOL(get_cpu_entry_area); void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) { unsigned long va = (unsigned long) cea_vaddr; + pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags); - set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); + /* + * The cpu_entry_area is shared between the user and kernel + * page tables. All of its ptes can safely be global. + * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for + * non-present PTEs, so be careful not to set it in that + * case to avoid confusion. + */ + if (boot_cpu_has(X86_FEATURE_PGE) && + (pgprot_val(flags) & _PAGE_PRESENT)) + pte = pte_set_flags(pte, _PAGE_GLOBAL); + + set_pte_vaddr(va, pte); } static void __init diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 62a7e9f65dec..cc7ff5957194 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/highmem.h> #include <asm/pgtable.h> @@ -334,16 +335,16 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, pgprotval_t eff_in, unsigned long P) { int i; - pte_t *start; + pte_t *pte; pgprotval_t prot, eff; - start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - prot = pte_flags(*start); - eff = effective_prot(eff_in, prot); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); + pte = pte_offset_map(&addr, st->current_address); + prot = pte_flags(*pte); + eff = effective_prot(eff_in, prot); note_page(m, st, __pgprot(prot), eff, 5); - start++; + pte_unmap(pte); } } #ifdef CONFIG_KASAN diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 9aa22be8331e..a2f0c7e20fb0 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -98,6 +98,9 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (!info->kernpg_flag) info->kernpg_flag = _KERNPG_TABLE; + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + info->kernpg_flag &= __default_kernel_pte_mask; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 82f5252c723a..fec82b577c18 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -161,12 +161,6 @@ struct map_range { static int page_size_mask; -static void enable_global_pages(void) -{ - if (!static_cpu_has(X86_FEATURE_PTI)) - __supported_pte_mask |= _PAGE_GLOBAL; -} - static void __init probe_page_size_mask(void) { /* @@ -187,9 +181,15 @@ static void __init probe_page_size_mask(void) __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); - enable_global_pages(); + __supported_pte_mask |= _PAGE_GLOBAL; } + /* By the default is everything supported: */ + __default_kernel_pte_mask = __supported_pte_mask; + /* Except when with PTI where the kernel is mostly non-Global: */ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + __default_kernel_pte_mask &= ~_PAGE_GLOBAL; + /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8008db2bddb3..c893c6a3d707 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -558,8 +558,14 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); +#define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) +/* Bits supported by the hardware: */ +pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; +/* Bits allowed in normal kernel mappings: */ +pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; EXPORT_SYMBOL_GPL(__supported_pte_mask); +/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ +EXPORT_SYMBOL(__default_kernel_pte_mask); /* user-defined highmem size */ static unsigned int highmem_pages = -1; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 66de40e45f58..0a400606dea0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -65,8 +65,13 @@ * around without checking the pgd every time. */ +/* Bits supported by the hardware: */ pteval_t __supported_pte_mask __read_mostly = ~0; +/* Bits allowed in normal kernel mappings: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); +/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ +EXPORT_SYMBOL(__default_kernel_pte_mask); int force_personality32; @@ -1286,6 +1291,12 @@ void mark_rodata_ro(void) (unsigned long) __va(__pa_symbol(_sdata))); debug_checkwx(); + + /* + * Do this after all of the manipulation of the + * kernel text page tables are complete. + */ + pti_clone_kernel_text(); } int kern_addr_valid(unsigned long addr) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index ada98b39b8ad..b3294d36769d 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,6 +44,9 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) return ret; *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(*prot) &= __default_kernel_pte_mask; + return 0; } EXPORT_SYMBOL_GPL(iomap_create_wc); @@ -88,6 +91,9 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e2db83bebc3b..c63a545ec199 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -816,6 +816,9 @@ void __init __early_set_fixmap(enum fixed_addresses idx, } pte = early_ioremap_pte(addr); + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d8ff013ea9d0..980dbebd0ca7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -269,6 +269,12 @@ void __init kasan_early_init(void) pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; + /* Mask out unsupported __PAGE_KERNEL bits: */ + pte_val &= __default_kernel_pte_mask; + pmd_val &= __default_kernel_pte_mask; + pud_val &= __default_kernel_pte_mask; + p4d_val &= __default_kernel_pte_mask; + for (i = 0; i < PTRS_PER_PTE; i++) kasan_zero_pte[i] = __pte(pte_val); @@ -371,7 +377,13 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); + pte_t pte; + pgprot_t prot; + + prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC); + pgprot_val(prot) &= __default_kernel_pte_mask; + + pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot)); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 85cf12219dea..3bded76e8d5c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -93,6 +93,18 @@ void arch_report_meminfo(struct seq_file *m) static inline void split_page_count(int level) { } #endif +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +static inline int +within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr <= end; +} + #ifdef CONFIG_X86_64 static inline unsigned long highmap_start_pfn(void) @@ -106,20 +118,25 @@ static inline unsigned long highmap_end_pfn(void) return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } -#endif - -static inline int -within(unsigned long addr, unsigned long start, unsigned long end) +static bool __cpa_pfn_in_highmap(unsigned long pfn) { - return addr >= start && addr < end; + /* + * Kernel text has an alias mapping at a high address, known + * here as "highmap". + */ + return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } -static inline int -within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +#else + +static bool __cpa_pfn_in_highmap(unsigned long pfn) { - return addr >= start && addr <= end; + /* There is no highmap on 32-bit */ + return false; } +#endif + /* * Flushing functions */ @@ -172,7 +189,7 @@ static void __cpa_flush_all(void *arg) static void cpa_flush_all(unsigned long cache) { - BUG_ON(irqs_disabled()); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } @@ -236,7 +253,7 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ #endif - BUG_ON(irqs_disabled()); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); @@ -298,9 +315,11 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, /* * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. + * catches all aliases. This also includes __ro_after_init, + * so do not enforce until kernel_set_to_readonly is true. */ - if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + if (kernel_set_to_readonly && + within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; @@ -512,6 +531,23 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) #endif } +static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) +{ + /* + * _PAGE_GLOBAL means "global page" for present PTEs. + * But, it is also used to indicate _PAGE_PROTNONE + * for non-present PTEs. + * + * This ensures that a _PAGE_GLOBAL PTE going from + * present to non-present is not confused as + * _PAGE_PROTNONE. + */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + pgprot_val(prot) &= ~_PAGE_GLOBAL; + + return prot; +} + static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) @@ -566,6 +602,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. */ old_pte = *kpte; + /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); @@ -577,19 +614,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); - - /* - * Set the PSE and GLOBAL flags only if the PRESENT flag is - * set otherwise pmd_present/pmd_huge will return true even on - * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL - * for the ancient hardware that doesn't support it. - */ + req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) - pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; - else - pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); - - req_prot = canon_pgprot(req_prot); + pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need @@ -674,8 +701,12 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); - /* clear PSE and promote PAT bit to correct position */ + /* + * Clear PSE (aka _PAGE_PAT) and move + * PAT bit to correct position. + */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); break; @@ -698,23 +729,14 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, return 1; } - /* - * Set the GLOBAL flags only if the PRESENT flag is set - * otherwise pmd/pte_present will return true even on a non - * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL - * for the ancient hardware that doesn't support it. - */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_GLOBAL; - else - pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; + ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); + set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); @@ -930,19 +952,7 @@ static void populate_pte(struct cpa_data *cpa, pte = pte_offset_kernel(pmd, start); - /* - * Set the GLOBAL flags only if the PRESENT flag is - * set otherwise pte_present will return true even on - * a non present pte. The canon_pgprot will clear - * _PAGE_GLOBAL for the ancient hardware that doesn't - * support it. - */ - if (pgprot_val(pgprot) & _PAGE_PRESENT) - pgprot_val(pgprot) |= _PAGE_GLOBAL; - else - pgprot_val(pgprot) &= ~_PAGE_GLOBAL; - - pgprot = canon_pgprot(pgprot); + pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); @@ -1190,6 +1200,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; + + } else if (__cpa_pfn_in_highmap(cpa->pfn)) { + /* Faults in the highmap are OK, so do not warn: */ + return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, @@ -1234,24 +1248,14 @@ repeat: new_prot = static_protections(new_prot, address, pfn); - /* - * Set the GLOBAL flags only if the PRESENT flag is - * set otherwise pte_present will return true even on - * a non present pte. The canon_pgprot will clear - * _PAGE_GLOBAL for the ancient hardware that doesn't - * support it. - */ - if (pgprot_val(new_prot) & _PAGE_PRESENT) - pgprot_val(new_prot) |= _PAGE_GLOBAL; - else - pgprot_val(new_prot) &= ~_PAGE_GLOBAL; + new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change it's attributes * not the memory it points to */ - new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? @@ -1352,8 +1356,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && - within_inclusive(cpa->pfn, highmap_start_pfn(), - highmap_end_pfn())) { + __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; @@ -1428,11 +1431,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, memset(&cpa, 0, sizeof(cpa)); /* - * Check, if we are requested to change a not supported - * feature: + * Check, if we are requested to set a not supported + * feature. Clearing non-supported features is OK. */ mask_set = canon_pgprot(mask_set); - mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; @@ -1775,6 +1778,12 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +int set_memory_nonglobal(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 34cda7e0551b..ffc8c13c50e4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> @@ -583,6 +584,9 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } @@ -636,6 +640,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) (mtrr != MTRR_TYPE_WRBACK)) return 0; + /* Bail out if we are we on a populated non-leaf entry: */ + if (pud_present(*pud) && !pud_huge(*pud)) + return 0; + prot = pgprot_4k_2_large(prot); set_pte((pte_t *)pud, pfn_pte( @@ -664,6 +672,10 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 0; } + /* Bail out if we are we on a populated non-leaf entry: */ + if (pmd_present(*pmd) && !pmd_huge(*pmd)) + return 0; + prot = pgprot_4k_2_large(prot); set_pte((pte_t *)pmd, pfn_pte( diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index d7bc0eea20a5..6e98e0a7c923 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey */ if (pkey != -1) return pkey; - /* - * Look for a protection-key-drive execute-only mapping - * which is now being given permissions that are not - * execute-only. Move it back to the default pkey. - */ - if (vma_is_pkey_exec_only(vma) && - (prot & (PROT_READ|PROT_WRITE))) { - return 0; - } + /* * The mapping is execute-only. Go try to get the * execute-only protection key. If we fail to do that, * fall through as if we do not have execute-only - * support. + * support in this mm. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(vma->vm_mm); if (pkey > 0) return pkey; + } else if (vma_is_pkey_exec_only(vma)) { + /* + * Protections are *not* PROT_EXEC, but the mapping + * is using the exec-only pkey. This mapping was + * PROT_EXEC and will no longer be. Move back to + * the default pkey. + */ + return ARCH_DEFAULT_PKEY; } + /* * This is a vanilla, non-pkey mprotect (or we failed to * setup execute-only), inherit the pkey from the VMA we diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 631507f0c198..4d418e705878 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -66,12 +66,22 @@ static void __init pti_print_if_secure(const char *reason) pr_info("%s\n", reason); } +enum pti_mode { + PTI_AUTO = 0, + PTI_FORCE_OFF, + PTI_FORCE_ON +} pti_mode; + void __init pti_check_boottime_disable(void) { char arg[5]; int ret; + /* Assume mode is auto unless overridden. */ + pti_mode = PTI_AUTO; + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } @@ -79,18 +89,23 @@ void __init pti_check_boottime_disable(void) ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); if (ret > 0) { if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; } if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_mode = PTI_FORCE_ON; pti_print_if_secure("force enabled on command line."); goto enable; } - if (ret == 4 && !strncmp(arg, "auto", 4)) + if (ret == 4 && !strncmp(arg, "auto", 4)) { + pti_mode = PTI_AUTO; goto autosel; + } } if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; } @@ -149,7 +164,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) * * Returns a pointer to a P4D on success, or NULL on failure. */ -static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); @@ -177,7 +192,7 @@ static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) * * Returns a pointer to a PMD on success, or NULL on failure. */ -static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d = pti_user_pagetable_walk_p4d(address); @@ -267,7 +282,7 @@ static void __init pti_setup_vsyscall(void) static void __init pti_setup_vsyscall(void) { } #endif -static void __init +static void pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) { unsigned long addr; @@ -300,6 +315,27 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) return; /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; + + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistentency with + * code that only set this bit when supported. + */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + + /* * Copy the PMD. That is, the kernelmode and usermode * tables will share the last-level page tables of this * address range @@ -348,7 +384,103 @@ static void __init pti_clone_entry_text(void) { pti_clone_pmds((unsigned long) __entry_text_start, (unsigned long) __irqentry_text_end, - _PAGE_RW | _PAGE_GLOBAL); + _PAGE_RW); +} + +/* + * Global pages and PCIDs are both ways to make kernel TLB entries + * live longer, reduce TLB misses and improve kernel performance. + * But, leaving all kernel text Global makes it potentially accessible + * to Meltdown-style attacks which make it trivial to find gadgets or + * defeat KASLR. + * + * Only use global pages when it is really worth it. + */ +static inline bool pti_kernel_image_global_ok(void) +{ + /* + * Systems with PCIDs get litlle benefit from global + * kernel text and are not worth the downsides. + */ + if (cpu_feature_enabled(X86_FEATURE_PCID)) + return false; + + /* + * Only do global kernel image for pti=auto. Do the most + * secure thing (not global) if pti=on specified. + */ + if (pti_mode != PTI_AUTO) + return false; + + /* + * K8 may not tolerate the cleared _PAGE_RW on the userspace + * global kernel image pages. Do the safe thing (disable + * global kernel image). This is unlikely to ever be + * noticed because PTI is disabled by default on AMD CPUs. + */ + if (boot_cpu_has(X86_FEATURE_K8)) + return false; + + /* + * RANDSTRUCT derives its hardening benefits from the + * attacker's lack of knowledge about the layout of kernel + * data structures. Keep the kernel image non-global in + * cases where RANDSTRUCT is in use to help keep the layout a + * secret. + */ + if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT)) + return false; + + return true; +} + +/* + * For some configurations, map all of kernel text into the user page + * tables. This reduces TLB misses, especially on non-PCID systems. + */ +void pti_clone_kernel_text(void) +{ + /* + * rodata is part of the kernel image and is normally + * readable on the filesystem or on the web. But, do not + * clone the areas past rodata, they might contain secrets. + */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = (unsigned long)__end_rodata_hpage_align; + + if (!pti_kernel_image_global_ok()) + return; + + pr_debug("mapping partial kernel image into user address space\n"); + + /* + * Note that this will undo _some_ of the work that + * pti_set_kernel_image_nonglobal() did to clear the + * global bit. + */ + pti_clone_pmds(start, end, _PAGE_RW); +} + +/* + * This is the only user for it and it is not arch-generic like + * the other set_memory.h functions. Just extern it. + */ +extern int set_memory_nonglobal(unsigned long addr, int numpages); +void pti_set_kernel_image_nonglobal(void) +{ + /* + * The identity map is created with PMDs, regardless of the + * actual length of the kernel. We need to clear + * _PAGE_GLOBAL up to a PMD boundary, not just to the end + * of the image. + */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); + + if (pti_kernel_image_global_ok()) + return; + + set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } /* @@ -362,6 +494,10 @@ void __init pti_init(void) pr_info("enabled\n"); pti_clone_user_shared(); + + /* Undo all global bits from the init pagetables in head_64.S: */ + pti_set_kernel_image_nonglobal(); + /* Replace some of the global bits just for shared entry text: */ pti_clone_entry_text(); pti_setup_espfix64(); pti_setup_vsyscall(); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b725154182cc..263c8453815e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1027,7 +1027,17 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ break; case BPF_JMP | BPF_JA: - jmp_offset = addrs[i + insn->off] - addrs[i]; + if (insn->off == -1) + /* -1 jmp instructions will always jump + * backwards two bytes. Explicitly handling + * this case avoids wasting too many passes + * when there are long sequences of replaced + * dead code. + */ + jmp_offset = -2; + else + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (!jmp_offset) /* optimize out nop jumps */ break; @@ -1226,6 +1236,7 @@ skip_init_addrs: for (pass = 0; pass < 20 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); if (proglen <= 0) { +out_image: image = NULL; if (header) bpf_jit_binary_free(header); @@ -1236,8 +1247,7 @@ skip_init_addrs: if (proglen != oldproglen) { pr_err("bpf_jit: proglen=%d != oldproglen=%d\n", proglen, oldproglen); - prog = orig_prog; - goto out_addrs; + goto out_image; } break; } @@ -1273,7 +1283,7 @@ skip_init_addrs: prog = orig_prog; } - if (!prog->is_func || extra_pass) { + if (!image || !prog->is_func || extra_pass) { out_addrs: kfree(addrs); kfree(jit_data); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 74a532989308..ccf4a49bb065 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -51,6 +51,12 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) pmd_t *pmd; pud_t *pud; p4d_t *p4d = NULL; + pgprot_t pgtable_prot = __pgprot(_KERNPG_TABLE); + pgprot_t pmd_text_prot = __pgprot(__PAGE_KERNEL_LARGE_EXEC); + + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(pmd_text_prot) &= __default_kernel_pte_mask; + pgprot_val(pgtable_prot) &= __default_kernel_pte_mask; /* * The new mapping only has to cover the page containing the image @@ -81,15 +87,19 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) return -ENOMEM; set_pmd(pmd + pmd_index(restore_jump_address), - __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); + __pmd((jump_address_phys & PMD_MASK) | pgprot_val(pmd_text_prot))); set_pud(pud + pud_index(restore_jump_address), - __pud(__pa(pmd) | _KERNPG_TABLE)); + __pud(__pa(pmd) | pgprot_val(pgtable_prot))); if (p4d) { - set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); + p4d_t new_p4d = __p4d(__pa(pud) | pgprot_val(pgtable_prot)); + pgd_t new_pgd = __pgd(__pa(p4d) | pgprot_val(pgtable_prot)); + + set_p4d(p4d + p4d_index(restore_jump_address), new_p4d); + set_pgd(pgd + pgd_index(restore_jump_address), new_pgd); } else { /* No p4d for 4-level paging: point the pgd to the pud page table */ - set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); + pgd_t new_pgd = __pgd(__pa(pud) | pgprot_val(pgtable_prot)); + set_pgd(pgd + pgd_index(restore_jump_address), new_pgd); } return 0; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index d70c15de417b..2e9ee023e6bc 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -6,6 +6,9 @@ purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) +$(obj)/sha256.o: $(srctree)/lib/sha256.c + $(call if_changed_rule,cc_o_c) + LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 470edad96bb9..025c34ac0d84 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -11,9 +11,9 @@ */ #include <linux/bug.h> +#include <linux/sha256.h> #include <asm/purgatory.h> -#include "sha256.h" #include "../boot/string.h" unsigned long purgatory_backup_dest __section(.kexec-purgatory); diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c deleted file mode 100644 index 548ca675a14a..000000000000 --- a/arch/x86/purgatory/sha256.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. - * - * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> - * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> - * Copyright (c) 2014 Red Hat Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ - -#include <linux/bitops.h> -#include <asm/byteorder.h> -#include "sha256.h" -#include "../boot/string.h" - -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} - -#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) -#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) -#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) -#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) - -static inline void LOAD_OP(int I, u32 *W, const u8 *input) -{ - W[I] = __be32_to_cpu(((__be32 *)(input))[I]); -} - -static inline void BLEND_OP(int I, u32 *W) -{ - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; -} - -static void sha256_transform(u32 *state, const u8 *input) -{ - u32 a, b, c, d, e, f, g, h, t1, t2; - u32 W[64]; - int i; - - /* load the input */ - for (i = 0; i < 16; i++) - LOAD_OP(i, W, input); - - /* now blend */ - for (i = 16; i < 64; i++) - BLEND_OP(i, W); - - /* load the state into our registers */ - a = state[0]; b = state[1]; c = state[2]; d = state[3]; - e = state[4]; f = state[5]; g = state[6]; h = state[7]; - - /* now iterate */ - t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; - t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; - t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; - t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; - t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; - t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; - t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; - t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; - t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; - t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; - t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; - t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; - t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; - t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; - t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; - t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; - - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; - - /* clear any sensitive info... */ - a = b = c = d = e = f = g = h = t1 = t2 = 0; - memset(W, 0, 64 * sizeof(u32)); -} - -int sha256_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - - return 0; -} - -int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) -{ - unsigned int partial, done; - const u8 *src; - - partial = sctx->count & 0x3f; - sctx->count += len; - done = 0; - src = data; - - if ((partial + len) > 63) { - if (partial) { - done = -partial; - memcpy(sctx->buf + partial, data, done + 64); - src = sctx->buf; - } - - do { - sha256_transform(sctx->state, src); - done += 64; - src = data + done; - } while (done + 63 < len); - - partial = 0; - } - memcpy(sctx->buf + partial, src, len - done); - - return 0; -} - -int sha256_final(struct sha256_state *sctx, u8 *out) -{ - __be32 *dst = (__be32 *)out; - __be64 bits; - unsigned int index, pad_len; - int i; - static const u8 padding[64] = { 0x80, }; - - /* Save number of bits */ - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64. */ - index = sctx->count & 0x3f; - pad_len = (index < 56) ? (56 - index) : ((64+56) - index); - sha256_update(sctx, padding, pad_len); - - /* Append length (before padding) */ - sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Zeroize sensitive information. */ - memset(sctx, 0, sizeof(*sctx)); - - return 0; -} diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h deleted file mode 100644 index 2867d9825a57..000000000000 --- a/arch/x86/purgatory/sha256.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2014 Red Hat Inc. - * - * Author: Vivek Goyal <vgoyal@redhat.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#ifndef SHA256_H -#define SHA256_H - -#include <linux/types.h> -#include <crypto/sha.h> - -extern int sha256_init(struct sha256_state *sctx); -extern int sha256_update(struct sha256_state *sctx, const u8 *input, - unsigned int length); -extern int sha256_final(struct sha256_state *sctx, u8 *hash); - -#endif /* SHA256_H */ diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c index d886b1fa36f0..795ca4f2cb3c 100644 --- a/arch/x86/purgatory/string.c +++ b/arch/x86/purgatory/string.c @@ -10,4 +10,16 @@ * Version 2. See the file COPYING for more details. */ +#include <linux/types.h> + #include "../boot/string.c" + +void *memcpy(void *dst, const void *src, size_t len) +{ + return __builtin_memcpy(dst, src, len); +} + +void *memset(void *dst, int c, size_t len) +{ + return __builtin_memset(dst, c, len); +} diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 2163888497d3..5e53bfbe5823 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -112,7 +112,7 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static int xen_id_always_valid(int apicid) +static int xen_id_always_valid(u32 apicid) { return 1; } diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 826898701045..19c1ff542387 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -65,6 +65,19 @@ static void __init xen_hvm_init_mem_mapping(void) { early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE); HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); + + /* + * The virtual address of the shared_info page has changed, so + * the vcpu_info pointer for VCPU 0 is now stale. + * + * The prepare_boot_cpu callback will re-initialize it via + * xen_vcpu_setup, but we can't rely on that to be called for + * old Xen versions (xen_have_vector_callback == 0). + * + * It is, in any case, bad to have a stale vcpu_info pointer + * so reset it now. + */ + xen_vcpu_info_reset(0); } static void __init init_hvm_pv_info(void) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 3c2c2530737e..357969a3697c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -421,45 +421,33 @@ static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ + unsigned long pfn, mfn; + int level; + pte_t *ptep; + void *virt; - BUG_ON(size > 65536); + /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */ + BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - int level; - pte_t *ptep; - unsigned long pfn, mfn; - void *virt; - - /* - * The GDT is per-cpu and is in the percpu data area. - * That can be virtually mapped, so we need to do a - * page-walk to get the underlying MFN for the - * hypercall. The page can also be in the kernel's - * linear range, so we need to RO that mapping too. - */ - ptep = lookup_address(va, &level); - BUG_ON(ptep == NULL); - - pfn = pte_pfn(*ptep); - mfn = pfn_to_mfn(pfn); - virt = __va(PFN_PHYS(pfn)); + /* + * The GDT is per-cpu and is in the percpu data area. + * That can be virtually mapped, so we need to do a + * page-walk to get the underlying MFN for the + * hypercall. The page can also be in the kernel's + * linear range, so we need to RO that mapping too. + */ + ptep = lookup_address(va, &level); + BUG_ON(ptep == NULL); - frames[f] = mfn; + pfn = pte_pfn(*ptep); + mfn = pfn_to_mfn(pfn); + virt = __va(PFN_PHYS(pfn)); - make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(virt); - } + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(virt); - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct))) BUG(); } @@ -470,34 +458,22 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); - unsigned long frames[pages]; - int f; - - /* - * A GDT can be up to 64k in size, which corresponds to 8192 - * 8-byte entries, or 16 4k pages.. - */ + unsigned long pfn, mfn; + pte_t pte; - BUG_ON(size > 65536); + /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */ + BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - pte_t pte; - unsigned long pfn, mfn; - - pfn = virt_to_pfn(va); - mfn = pfn_to_mfn(pfn); - - pte = pfn_pte(pfn, PAGE_KERNEL_RO); + pfn = virt_to_pfn(va); + mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) - BUG(); + pte = pfn_pte(pfn, PAGE_KERNEL_RO); - frames[f] = mfn; - } + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) + BUG(); - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct))) BUG(); } @@ -1259,10 +1235,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - /* Work out if we support NX */ - get_cpu_cap(&boot_cpu_data); - x86_configure_nx(); - /* Get mfn list */ xen_build_dynamic_phys_to_machine(); @@ -1272,6 +1244,10 @@ asmlinkage __visible void __init xen_start_kernel(void) */ xen_setup_gdt(0); + /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); + x86_configure_nx(); + xen_init_irq_ops(); /* Let's presume PV guests always boot on vCPU with id 0. */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d33e7dbe3129..2d76106788a3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -42,13 +42,11 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) } EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); -static void xen_flush_tlb_all(void) +static noinline void xen_flush_tlb_all(void) { struct mmuext_op *op; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_all(0); - preempt_disable(); mcs = xen_mc_entry(sizeof(*op)); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 486c0a34d00b..2c30cabfda90 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1310,13 +1310,11 @@ unsigned long xen_read_cr2_direct(void) return this_cpu_read(xen_vcpu_info.arch.cr2); } -static void xen_flush_tlb(void) +static noinline void xen_flush_tlb(void) { struct mmuext_op *op; struct multicall_space mcs; - trace_xen_mmu_flush_tlb(0); - preempt_disable(); mcs = xen_mc_entry(sizeof(*op)); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 96f26e026783..5077ead5e59c 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -89,7 +89,9 @@ END(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, - .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) + .long (1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_dom0) | \ + (1 << XENFEAT_linux_rsdp_unrestricted)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, |