Diffstat (limited to 'arch/x86/mm')
 -rw-r--r--   arch/x86/mm/Makefile               |   9
 -rw-r--r--   arch/x86/mm/cpu_entry_area.c       | 166
 -rw-r--r--   arch/x86/mm/debug_pagetables.c     |  80
 -rw-r--r--   arch/x86/mm/dump_pagetables.c      | 141
 -rw-r--r--   arch/x86/mm/extable.c              |   6
 -rw-r--r--   arch/x86/mm/fault.c                |   4
 -rw-r--r--   arch/x86/mm/init.c                 |  82
 -rw-r--r--   arch/x86/mm/init_32.c              |   6
 -rw-r--r--   arch/x86/mm/ioremap.c              |   4
 -rw-r--r--   arch/x86/mm/kasan_init_64.c        |  23
 -rw-r--r--   arch/x86/mm/kaslr.c                |  32
 -rw-r--r--   arch/x86/mm/kmemcheck/error.c      |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/error.h      |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/opcode.c     |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/opcode.h     |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/pte.c        |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/pte.h        |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/selftest.c   |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/selftest.h   |   1
 -rw-r--r--   arch/x86/mm/kmemcheck/shadow.h     |   1
 -rw-r--r--   arch/x86/mm/kmmio.c                |  12
 -rw-r--r--   arch/x86/mm/mem_encrypt.c          |   4
 -rw-r--r--   arch/x86/mm/pgtable.c              |   5
 -rw-r--r--   arch/x86/mm/pgtable_32.c           |   1
 -rw-r--r--   arch/x86/mm/pti.c                  | 368
 -rw-r--r--   arch/x86/mm/tlb.c                  |  64
26 files changed, 876 insertions(+), 140 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8e13b8cc6bed..27e9e90a8d35 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o	= -pg  endif  obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ -	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o +	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o  # Make sure __phys_addr has no stackprotector  nostackp := $(call cc-option, -fno-stack-protector) @@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o  obj-$(CONFIG_ACPI_NUMA)		+= srat.o  obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000000..b9283cc27622 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/desc.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ +	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; +	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + +	return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ +	unsigned long va = (unsigned long) cea_vaddr; + +	set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ +	for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) +		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL +	int npages; +	void *cea; + +	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) +		return; + +	cea = &get_cpu_entry_area(cpu)->cpu_debug_store; +	npages = sizeof(struct debug_store) / PAGE_SIZE; +	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); +	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, +			     PAGE_KERNEL); + +	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; +	/* +	 * Force the population of PMDs for not yet allocated per cpu +	 * memory like debug store buffers. +	 */ +	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; +	for (; npages; npages--, cea += PAGE_SIZE) +		cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 +	extern char _entry_trampoline[]; + +	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ +	pgprot_t gdt_prot = PAGE_KERNEL_RO; +	pgprot_t tss_prot = PAGE_KERNEL_RO; +#else +	/* +	 * On native 32-bit systems, the GDT cannot be read-only because +	 * our double fault handler uses a task gate, and entering through +	 * a task gate needs to change an available TSS to busy.  If the +	 * GDT is read-only, that will triple fault.  The TSS cannot be +	 * read-only because the CPU writes to it on task switches. +	 * +	 * On Xen PV, the GDT must be read-only because the hypervisor +	 * requires it. +	 */ +	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? +		PAGE_KERNEL_RO : PAGE_KERNEL; +	pgprot_t tss_prot = PAGE_KERNEL; +#endif + +	cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), +		    gdt_prot); + +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, +			     per_cpu_ptr(&entry_stack_storage, cpu), 1, +			     PAGE_KERNEL); + +	/* +	 * The Intel SDM says (Volume 3, 7.2.1): +	 * +	 *  Avoid placing a page boundary in the part of the TSS that the +	 *  processor reads during a task switch (the first 104 bytes). The +	 *  processor may not correctly perform address translations if a +	 *  boundary occurs in this area. During a task switch, the processor +	 *  reads and writes into the first 104 bytes of each TSS (using +	 *  contiguous physical addresses beginning with the physical address +	 *  of the first byte of the TSS). So, after TSS access begins, if +	 *  part of the 104 bytes is not physically contiguous, the processor +	 *  will access incorrect information without generating a page-fault +	 *  exception. +	 * +	 * There are also a lot of errata involving the TSS spanning a page +	 * boundary.  Assert that we're not doing that. +	 */ +	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ +		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); +	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, +			     &per_cpu(cpu_tss_rw, cpu), +			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 +	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 +	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); +	BUILD_BUG_ON(sizeof(exception_stacks) != +		     sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, +			     &per_cpu(exception_stacks, cpu), +			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + +	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, +		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif +	percpu_setup_debug_store(cpu); +} + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 +	unsigned long start, end; + +	BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); +	BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + +	start = CPU_ENTRY_AREA_BASE; +	end = start + CPU_ENTRY_AREA_MAP_SIZE; + +	/* Careful here: start + PMD_SIZE might wrap around */ +	for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) +		populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ +	unsigned int cpu; + +	setup_cpu_entry_area_ptes(); + +	for_each_possible_cpu(cpu) +		setup_cpu_entry_area(cpu); +} diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@  static int ptdump_show(struct seq_file *m, void *v)  { 
-	ptdump_walk_pgd_level(m, NULL); +	ptdump_walk_pgd_level_debugfs(m, NULL, false);  	return 0;  } @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {  	.release	= single_release,  }; -static struct dentry *pe; +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ +	if (current->mm->pgd) { +		down_read(¤t->mm->mmap_sem); +		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); +		up_read(¤t->mm->mmap_sem); +	} +	return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ +	return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { +	.owner		= THIS_MODULE, +	.open		= ptdump_open_curknl, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ +	if (current->mm->pgd) { +		down_read(¤t->mm->mmap_sem); +		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); +		up_read(¤t->mm->mmap_sem); +	} +	return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ +	return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { +	.owner		= THIS_MODULE, +	.open		= ptdump_open_curusr, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl;  static int __init pt_dump_debug_init(void)  { -	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, -				 &ptdump_fops); -	if (!pe) +	dir = debugfs_create_dir("page_tables", NULL); +	if (!dir)  		return -ENOMEM; +	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, +				     &ptdump_fops); +	if (!pe_knl) +		goto err; + +	pe_curknl = debugfs_create_file("current_kernel", 0400, +					dir, NULL, &ptdump_curknl_fops); +	if (!pe_curknl) +		goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	pe_curusr = debugfs_create_file("current_user", 0400, +					dir, NULL, &ptdump_curusr_fops); +	if (!pe_curusr) +		goto err; +#endif  	return 0; +err: +	debugfs_remove_recursive(dir); +	return -ENOMEM;  }  static void __exit pt_dump_debug_exit(void)  { -	debugfs_remove_recursive(pe); +	debugfs_remove_recursive(dir);  }  module_init(pt_dump_debug_init); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e3ac6fe6c9e..2a4849e92831 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,68 +44,97 @@ struct addr_marker {  	unsigned long max_lines;  }; -/* indices for address_markers; keep sync'd w/ address_markers below */ +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 +  enum address_markers_idx {  	USER_SPACE_NR = 0, -#ifdef CONFIG_X86_64  	KERNEL_SPACE_NR,  	LOW_KERNEL_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) +	LDT_NR, +#endif  	VMALLOC_START_NR,  	VMEMMAP_START_NR,  #ifdef CONFIG_KASAN  	KASAN_SHADOW_START_NR,  	KASAN_SHADOW_END_NR,  #endif -# ifdef CONFIG_X86_ESPFIX64 +	CPU_ENTRY_AREA_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) +	LDT_NR, +#endif +#ifdef CONFIG_X86_ESPFIX64  	ESPFIX_START_NR, -# endif +#endif +#ifdef CONFIG_EFI +	EFI_END_NR, +#endif  	HIGH_KERNEL_NR,  	MODULES_VADDR_NR,  	MODULES_END_NR, -#else +	FIXADDR_START_NR, +	END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { +	[USER_SPACE_NR]		= { 0,			"User Space" }, +	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" }, +	
[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" }, +	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" }, +	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" }, +#ifdef CONFIG_KASAN +	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" }, +	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL +	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" }, +#endif +	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 +	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI +	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" }, +#endif +	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" }, +	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" }, +	[MODULES_END_NR]	= { MODULES_END,	"End Modules" }, +	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" }, +	[END_OF_SPACE_NR]	= { -1,			NULL } +}; + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { +	USER_SPACE_NR = 0,  	KERNEL_SPACE_NR,  	VMALLOC_START_NR,  	VMALLOC_END_NR, -# ifdef CONFIG_HIGHMEM +#ifdef CONFIG_HIGHMEM  	PKMAP_BASE_NR, -# endif -	FIXADDR_START_NR,  #endif +	CPU_ENTRY_AREA_NR, +	FIXADDR_START_NR, +	END_OF_SPACE_NR,  }; -/* Address space markers hints */  static struct addr_marker address_markers[] = { -	{ 0, "User Space" }, -#ifdef CONFIG_X86_64 -	{ 0x8000000000000000UL, "Kernel Space" }, -	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" }, -	{ 0/* VMALLOC_START */, "vmalloc() Area" }, -	{ 0/* VMEMMAP_START */, "Vmemmap" }, -#ifdef CONFIG_KASAN -	{ KASAN_SHADOW_START,	"KASAN shadow" }, -	{ KASAN_SHADOW_END,	"KASAN shadow end" }, +	[USER_SPACE_NR]		= { 0,			"User Space" }, +	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" }, +	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" }, +	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" }, +#ifdef CONFIG_HIGHMEM +	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },  #endif -# ifdef CONFIG_X86_ESPFIX64 -	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 }, -# endif -# ifdef CONFIG_EFI -	{ EFI_VA_END,		"EFI Runtime Services" }, -# endif -	{ __START_KERNEL_map,   "High Kernel Mapping" }, -	{ MODULES_VADDR,        "Modules" }, -	{ MODULES_END,          "End Modules" }, -#else -	{ PAGE_OFFSET,          "Kernel Mapping" }, -	{ 0/* VMALLOC_START */, "vmalloc() Area" }, -	{ 0/*VMALLOC_END*/,     "vmalloc() End" }, -# ifdef CONFIG_HIGHMEM -	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" }, -# endif -	{ 0/*FIXADDR_START*/,   "Fixmap Area" }, -#endif -	{ -1, NULL }		/* End of list */ +	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" }, +	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" }, +	[END_OF_SPACE_NR]	= { -1,			NULL }  }; +#endif /* !CONFIG_X86_64 */ +  /* Multipliers for offsets within the PTEs */  #define PTE_LEVEL_MULT (PAGE_SIZE)  #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) @@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)  	static const char * const level_name[] =  		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; -	if (!pgprot_val(prot)) { +	if (!(pr & _PAGE_PRESENT)) {  		/* Not present */  		pt_dump_cont_printf(m, dmsg, "                              ");  	} else { @@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)  }  static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, -				       bool checkwx) +				       bool checkwx, bool dmesg)  {  #ifdef CONFIG_X86_64  	pgd_t *start = (pgd_t *) &init_top_pgt; @@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 
 	if (pgd) {  		start = pgd; -		st.to_dmesg = true; +		st.to_dmesg = dmesg;  	}  	st.check_wx = checkwx; @@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,  void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)  { -	ptdump_walk_pgd_level_core(m, pgd, false); +	ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	if (user && static_cpu_has(X86_FEATURE_PTI)) +		pgd = kernel_to_user_pgdp(pgd); +#endif +	ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +static void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	pgd_t *pgd = (pgd_t *) &init_top_pgt; + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	pr_info("x86/mm: Checking user space page tables\n"); +	pgd = kernel_to_user_pgdp(pgd); +	ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif  } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);  void ptdump_walk_pgd_level_checkwx(void)  { -	ptdump_walk_pgd_level_core(NULL, NULL, true); +	ptdump_walk_pgd_level_core(NULL, NULL, true, false); +	ptdump_walk_user_pgd_level_checkwx();  }  static int __init pt_dump_init(void) @@ -525,8 +578,8 @@ static int __init pt_dump_init(void)  	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;  # endif  	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; +	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;  #endif -  	return 0;  }  __initcall(pt_dump_init); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3321b446b66c..9fe656c42aa5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@  #include <linux/extable.h>  #include <linux/uaccess.h>  #include <linux/sched/debug.h> +#include <xen/xen.h>  #include <asm/fpu/internal.h>  #include <asm/traps.h> @@ -82,7 +83,7 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,  	return true;  } -EXPORT_SYMBOL_GPL(ex_handler_refcount); +EXPORT_SYMBOL(ex_handler_refcount);  /*   * Handler for when we fail to restore a task's FPU state.  We should never get @@ -212,8 +213,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)  	 * Old CPUs leave the high bits of CS on the stack  	 * undefined.  I'm not sure which CPUs do this, but at least  	 * the 486 DX works this way. +	 * Xen pv domains are not using the default __KERNEL_CS.  	 */ -	if (regs->cs != __KERNEL_CS) +	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)  		goto fail;  	/* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78ca9a8ee454..06fe3d51d385 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -701,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,  	else  		printk(KERN_CONT "paging request"); -	printk(KERN_CONT " at %p\n", (void *) address); +	printk(KERN_CONT " at %px\n", (void *) address);  	printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);  	dump_pagetable(address); @@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,  	if (!printk_ratelimit())  		return; -	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", +	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",  		task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG,  		tsk->comm, task_pid_nr(tsk), address,  		(void *)regs->ip, (void *)regs->sp, error_code); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6fdf91ef130a..82f5252c723a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,6 +20,7 @@  #include <asm/kaslr.h>  #include <asm/hypervisor.h>  #include <asm/cpufeature.h> +#include <asm/pti.h>  /*   * We need to define the tracepoints somewhere, and tlb.c @@ -160,6 +161,12 @@ struct map_range {  static int page_size_mask; +static void enable_global_pages(void) +{ +	if (!static_cpu_has(X86_FEATURE_PTI)) +		__supported_pte_mask |= _PAGE_GLOBAL; +} +  static void __init probe_page_size_mask(void)  {  	/* @@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)  		cr4_set_bits_and_update_boot(X86_CR4_PSE);  	/* Enable PGE if available */ +	__supported_pte_mask &= ~_PAGE_GLOBAL;  	if (boot_cpu_has(X86_FEATURE_PGE)) {  		cr4_set_bits_and_update_boot(X86_CR4_PGE); -		__supported_pte_mask |= _PAGE_GLOBAL; -	} else -		__supported_pte_mask &= ~_PAGE_GLOBAL; +		enable_global_pages(); +	}  	/* Enable 1 GB linear kernel mappings if available: */  	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { @@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)  static void setup_pcid(void)  { -#ifdef CONFIG_X86_64 -	if (boot_cpu_has(X86_FEATURE_PCID)) { -		if (boot_cpu_has(X86_FEATURE_PGE)) { -			/* -			 * This can't be cr4_set_bits_and_update_boot() -- -			 * the trampoline code can't handle CR4.PCIDE and -			 * it wouldn't do any good anyway.  Despite the name, -			 * cr4_set_bits_and_update_boot() doesn't actually -			 * cause the bits in question to remain set all the -			 * way through the secondary boot asm. -			 * -			 * Instead, we brute-force it and set CR4.PCIDE -			 * manually in start_secondary(). -			 */ -			cr4_set_bits(X86_CR4_PCIDE); -		} else { -			/* -			 * flush_tlb_all(), as currently implemented, won't -			 * work if PCID is on but PGE is not.  Since that -			 * combination doesn't exist on real hardware, there's -			 * no reason to try to fully support it, but it's -			 * polite to avoid corrupting data if we're on -			 * an improperly configured VM. -			 */ -			setup_clear_cpu_cap(X86_FEATURE_PCID); -		} +	if (!IS_ENABLED(CONFIG_X86_64)) +		return; + +	if (!boot_cpu_has(X86_FEATURE_PCID)) +		return; + +	if (boot_cpu_has(X86_FEATURE_PGE)) { +		/* +		 * This can't be cr4_set_bits_and_update_boot() -- the +		 * trampoline code can't handle CR4.PCIDE and it wouldn't +		 * do any good anyway.  Despite the name, +		 * cr4_set_bits_and_update_boot() doesn't actually cause +		 * the bits in question to remain set all the way through +		 * the secondary boot asm. +		 * +		 * Instead, we brute-force it and set CR4.PCIDE manually in +		 * start_secondary(). +		 */ +		cr4_set_bits(X86_CR4_PCIDE); + +		/* +		 * INVPCID's single-context modes (2/3) only work if we set +		 * X86_CR4_PCIDE, *and* we INVPCID support.  It's unusable +		 * on systems that have X86_CR4_PCIDE clear, or that have +		 * no INVPCID support at all. +		 */ +		if (boot_cpu_has(X86_FEATURE_INVPCID)) +			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); +	} else { +		/* +		 * flush_tlb_all(), as currently implemented, won't work if +		 * PCID is on but PGE is not.  Since that combination +		 * doesn't exist on real hardware, there's no reason to try +		 * to fully support it, but it's polite to avoid corrupting +		 * data if we're on an improperly configured VM. 
+		 */ +		setup_clear_cpu_cap(X86_FEATURE_PCID);  	} -#endif  }  #ifdef CONFIG_X86_32 @@ -622,6 +639,7 @@ void __init init_mem_mapping(void)  {  	unsigned long end; +	pti_check_boottime_disable();  	probe_page_size_mask();  	setup_pcid(); @@ -845,12 +863,12 @@ void __init zone_sizes_init(void)  	free_area_init_nodes(max_zone_pfns);  } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {  	.loaded_mm = &init_mm,  	.next_asid = 1,  	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */  }; -EXPORT_SYMBOL_GPL(cpu_tlbstate); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);  void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)  { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a64a6f2848d..135c9a7898c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@  #include <asm/setup.h>  #include <asm/set_memory.h>  #include <asm/page_types.h> +#include <asm/cpu_entry_area.h>  #include <asm/init.h>  #include "mm_internal.h" @@ -766,6 +767,7 @@ void __init mem_init(void)  	mem_init_print_info(NULL);  	printk(KERN_INFO "virtual kernel memory layout:\n"  		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n" +		"  cpu_entry : 0x%08lx - 0x%08lx   (%4ld kB)\n"  #ifdef CONFIG_HIGHMEM  		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"  #endif @@ -777,6 +779,10 @@ void __init mem_init(void)  		FIXADDR_START, FIXADDR_TOP,  		(FIXADDR_TOP - FIXADDR_START) >> 10, +		CPU_ENTRY_AREA_BASE, +		CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, +		CPU_ENTRY_AREA_MAP_SIZE >> 10, +  #ifdef CONFIG_HIGHMEM  		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,  		(LAST_PKMAP*PAGE_SIZE) >> 10, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6e4573b1da34..c45b6ec5357b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr)  		return;  	} +	mmiotrace_iounmap(addr); +  	addr = (volatile void __iomem *)  		(PAGE_MASK & (unsigned long __force)addr); -	mmiotrace_iounmap(addr); -  	/* Use the vm area unlocked, assuming the caller  	   ensures there isn't another iounmap for the same address  	   in parallel. 
Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8..47388f0c0e59 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -15,6 +15,7 @@  #include <asm/tlbflush.h>  #include <asm/sections.h>  #include <asm/pgtable.h> +#include <asm/cpu_entry_area.h>  extern struct range pfn_mapped[E820_MAX_ENTRIES]; @@ -277,6 +278,7 @@ void __init kasan_early_init(void)  void __init kasan_init(void)  {  	int i; +	void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;  #ifdef CONFIG_KASAN_INLINE  	register_die_notifier(&kasan_die_notifier); @@ -321,16 +323,33 @@ void __init kasan_init(void)  		map_range(&pfn_mapped[i]);  	} +	shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; +	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); +	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, +						PAGE_SIZE); + +	shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + +					CPU_ENTRY_AREA_MAP_SIZE); +	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); +	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, +					PAGE_SIZE); +  	kasan_populate_zero_shadow(  		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), -		kasan_mem_to_shadow((void *)__START_KERNEL_map)); +		shadow_cpu_entry_begin); + +	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, +			      (unsigned long)shadow_cpu_entry_end, 0); + +	kasan_populate_zero_shadow(shadow_cpu_entry_end, +				kasan_mem_to_shadow((void *)__START_KERNEL_map));  	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),  			      (unsigned long)kasan_mem_to_shadow(_end),  			      early_pfn_to_nid(__pa(_stext)));  	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), -			(void *)KASAN_SHADOW_END); +				(void *)KASAN_SHADOW_END);  	load_cr3(init_top_pgt);  	__flush_tlb_all(); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 879ef930e2c2..aedebd2ebf1e 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,25 +34,14 @@  #define TB_SHIFT 40  /* - * Virtual address start and end range for randomization. The end changes base - * on configuration to have the highest amount of space for randomization. - * It increases the possible random position for each randomized region. + * Virtual address start and end range for randomization.   * - * You need to add an if/def entry if you introduce a new memory region - * compatible with KASLR. Your entry must be in logical order with memory - * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to - * ensure that this order is correct and won't be changed. + * The end address could depend on more configuration options to make the + * highest amount of space for randomization available, but that's too hard + * to keep straight and caused issues already.   
*/  static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; - -#if defined(CONFIG_X86_ESPFIX64) -static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; -#elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_END; -#else -static const unsigned long vaddr_end = __START_KERNEL_map; -#endif +static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;  /* Default values */  unsigned long page_offset_base = __PAGE_OFFSET_BASE; @@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)  	unsigned long remain_entropy;  	/* -	 * All these BUILD_BUG_ON checks ensures the memory layout is -	 * consistent with the vaddr_start/vaddr_end variables. +	 * These BUILD_BUG_ON checks ensure the memory layout is consistent +	 * with the vaddr_start/vaddr_end variables. These checks are very +	 * limited....  	 */  	BUILD_BUG_ON(vaddr_start >= vaddr_end); -	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && -		     vaddr_end >= EFI_VA_END); -	BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || -		      IS_ENABLED(CONFIG_EFI)) && -		     vaddr_end >= __START_KERNEL_map); +	BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);  	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);  	if (!kaslr_memory_enabled()) diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/error.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/error.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/pte.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/pte.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c21c2ed04612..58477ec3d66d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -435,17 +435,18 @@ int register_kmmio_probe(struct 
kmmio_probe *p)  	unsigned long flags;  	int ret = 0;  	unsigned long size = 0; +	unsigned long addr = p->addr & PAGE_MASK;  	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);  	unsigned int l;  	pte_t *pte;  	spin_lock_irqsave(&kmmio_lock, flags); -	if (get_kmmio_probe(p->addr)) { +	if (get_kmmio_probe(addr)) {  		ret = -EEXIST;  		goto out;  	} -	pte = lookup_address(p->addr, &l); +	pte = lookup_address(addr, &l);  	if (!pte) {  		ret = -EINVAL;  		goto out; @@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)  	kmmio_count++;  	list_add_rcu(&p->list, &kmmio_probes);  	while (size < size_lim) { -		if (add_kmmio_fault_page(p->addr + size)) +		if (add_kmmio_fault_page(addr + size))  			pr_err("Unable to set page fault.\n");  		size += page_level_size(l);  	} @@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)  {  	unsigned long flags;  	unsigned long size = 0; +	unsigned long addr = p->addr & PAGE_MASK;  	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);  	struct kmmio_fault_page *release_list = NULL;  	struct kmmio_delayed_release *drelease;  	unsigned int l;  	pte_t *pte; -	pte = lookup_address(p->addr, &l); +	pte = lookup_address(addr, &l);  	if (!pte)  		return;  	spin_lock_irqsave(&kmmio_lock, flags);  	while (size < size_lim) { -		release_kmmio_fault_page(p->addr + size, &release_list); +		release_kmmio_fault_page(addr + size, &release_list);  		size += page_level_size(l);  	}  	list_del_rcu(&p->list); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index d9a9e9fc75dd..391b13402e40 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -405,13 +405,13 @@ bool sme_active(void)  {  	return sme_me_mask && !sev_enabled;  } -EXPORT_SYMBOL_GPL(sme_active); +EXPORT_SYMBOL(sme_active);  bool sev_active(void)  {  	return sme_me_mask && sev_enabled;  } -EXPORT_SYMBOL_GPL(sev_active); +EXPORT_SYMBOL(sev_active);  static const struct dma_map_ops sev_dma_ops = {  	.alloc                  = sev_alloc, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 96d456a94b03..004abf9ebf12 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)  		kmem_cache_free(pgd_cache, pgd);  }  #else +  static inline pgd_t *_pgd_alloc(void)  { -	return (pgd_t *)__get_free_page(PGALLOC_GFP); +	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);  }  static inline void _pgd_free(pgd_t *pgd)  { -	free_page((unsigned long)pgd); +	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);  }  #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6b9bf023a700..c3c5274410a9 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,6 +10,7 @@  #include <linux/pagemap.h>  #include <linux/spinlock.h> +#include <asm/cpu_entry_area.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/fixmap.h> diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..ce38f165489b --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,368 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + *	https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> + *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> + *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> + *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> + * + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and + *		       Andy Lutomirsky <luto@amacapital.net> + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> + +#include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/vsyscall.h> +#include <asm/cmdline.h> +#include <asm/pti.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> + +#undef pr_fmt +#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK	0 +#endif + +static void __init pti_print_if_insecure(const char *reason) +{ +	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		pr_info("%s\n", reason); +} + +static void __init pti_print_if_secure(const char *reason) +{ +	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		pr_info("%s\n", reason); +} + +void __init pti_check_boottime_disable(void) +{ +	char arg[5]; +	int ret; + +	if (hypervisor_is_type(X86_HYPER_XEN_PV)) { +		pti_print_if_insecure("disabled on XEN PV."); +		return; +	} + +	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); +	if (ret > 0)  { +		if (ret == 3 && !strncmp(arg, "off", 3)) { +			pti_print_if_insecure("disabled on command line."); +			return; +		} +		if (ret == 2 && !strncmp(arg, "on", 2)) { +			pti_print_if_secure("force enabled on command line."); +			goto enable; +		} +		if (ret == 4 && !strncmp(arg, "auto", 4)) +			goto autosel; +	} + +	if (cmdline_find_option_bool(boot_command_line, "nopti")) { +		pti_print_if_insecure("disabled on command line."); +		return; +	} + +autosel: +	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		return; +enable: +	setup_force_cpu_cap(X86_FEATURE_PTI); +} + +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ +	/* +	 * Changes to the high (kernel) portion of the kernelmode page +	 * tables are not automatically propagated to the usermode tables. +	 * +	 * Users should keep in mind that, unlike the kernelmode tables, +	 * there is no vmalloc_fault equivalent for the usermode tables. +	 * Top-level entries added to init_mm's usermode pgd after boot +	 * will not be automatically propagated to other mms. 
+	 */ +	if (!pgdp_maps_userspace(pgdp)) +		return pgd; + +	/* +	 * The user page tables get the full PGD, accessible from +	 * userspace: +	 */ +	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + +	/* +	 * If this is normal user memory, make it NX in the kernel +	 * pagetables so that, if we somehow screw up and return to +	 * usermode with the kernel CR3 loaded, we'll get a page fault +	 * instead of allowing user code to execute with the wrong CR3. +	 * +	 * As exceptions, we don't set NX if: +	 *  - _PAGE_USER is not set.  This could be an executable +	 *     EFI runtime mapping or something similar, and the kernel +	 *     may execute from it +	 *  - we don't have NX support +	 *  - we're clearing the PGD (i.e. the new pgd is not present). +	 */ +	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && +	    (__supported_pte_mask & _PAGE_NX)) +		pgd.pgd |= _PAGE_NX; + +	/* return the copy of the PGD we want the kernel to use: */ +	return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ +	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + +	if (address < PAGE_OFFSET) { +		WARN_ONCE(1, "attempt to walk user address\n"); +		return NULL; +	} + +	if (pgd_none(*pgd)) { +		unsigned long new_p4d_page = __get_free_page(gfp); +		if (!new_p4d_page) +			return NULL; + +		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); +	} +	BUILD_BUG_ON(pgd_large(*pgd) != 0); + +	return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); +	p4d_t *p4d = pti_user_pagetable_walk_p4d(address); +	pud_t *pud; + +	BUILD_BUG_ON(p4d_large(*p4d) != 0); +	if (p4d_none(*p4d)) { +		unsigned long new_pud_page = __get_free_page(gfp); +		if (!new_pud_page) +			return NULL; + +		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); +	} + +	pud = pud_offset(p4d, address); +	/* The user page tables do not use large mappings: */ +	if (pud_large(*pud)) { +		WARN_ON(1); +		return NULL; +	} +	if (pud_none(*pud)) { +		unsigned long new_pmd_page = __get_free_page(gfp); +		if (!new_pmd_page) +			return NULL; + +		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); +	} + +	return pmd_offset(pud, address); +} + +#ifdef CONFIG_X86_VSYSCALL_EMULATION +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down.  Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables.  It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); +	pmd_t *pmd = pti_user_pagetable_walk_pmd(address); +	pte_t *pte; + +	/* We can't do anything sensible if we hit a large mapping. 
*/ +	if (pmd_large(*pmd)) { +		WARN_ON(1); +		return NULL; +	} + +	if (pmd_none(*pmd)) { +		unsigned long new_pte_page = __get_free_page(gfp); +		if (!new_pte_page) +			return NULL; + +		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); +	} + +	pte = pte_offset_kernel(pmd, address); +	if (pte_flags(*pte) & _PAGE_USER) { +		WARN_ONCE(1, "attempt to walk to user pte\n"); +		return NULL; +	} +	return pte; +} + +static void __init pti_setup_vsyscall(void) +{ +	pte_t *pte, *target_pte; +	unsigned int level; + +	pte = lookup_address(VSYSCALL_ADDR, &level); +	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) +		return; + +	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); +	if (WARN_ON(!target_pte)) +		return; + +	*target_pte = *pte; +	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + +static void __init +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +{ +	unsigned long addr; + +	/* +	 * Clone the populated PMDs which cover start to end. These PMD areas +	 * can have holes. +	 */ +	for (addr = start; addr < end; addr += PMD_SIZE) { +		pmd_t *pmd, *target_pmd; +		pgd_t *pgd; +		p4d_t *p4d; +		pud_t *pud; + +		pgd = pgd_offset_k(addr); +		if (WARN_ON(pgd_none(*pgd))) +			return; +		p4d = p4d_offset(pgd, addr); +		if (WARN_ON(p4d_none(*p4d))) +			return; +		pud = pud_offset(p4d, addr); +		if (pud_none(*pud)) +			continue; +		pmd = pmd_offset(pud, addr); +		if (pmd_none(*pmd)) +			continue; + +		target_pmd = pti_user_pagetable_walk_pmd(addr); +		if (WARN_ON(!target_pmd)) +			return; + +		/* +		 * Copy the PMD.  That is, the kernelmode and usermode +		 * tables will share the last-level page tables of this +		 * address range +		 */ +		*target_pmd = pmd_clear_flags(*pmd, clear); +	} +} + +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ +	p4d_t *kernel_p4d, *user_p4d; +	pgd_t *kernel_pgd; + +	user_p4d = pti_user_pagetable_walk_p4d(addr); +	kernel_pgd = pgd_offset_k(addr); +	kernel_p4d = p4d_offset(kernel_pgd, addr); +	*user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ +	pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + +/* + * Clone the ESPFIX P4D into the user space visinble page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 +	pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void __init pti_clone_entry_text(void) +{ +	pti_clone_pmds((unsigned long) __entry_text_start, +			(unsigned long) __irqentry_text_end, +		       _PAGE_RW | _PAGE_GLOBAL); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	pr_info("enabled\n"); + +	pti_clone_user_shared(); +	pti_clone_entry_text(); +	pti_setup_espfix64(); +	pti_setup_vsyscall(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3118392cdf75..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,38 @@   *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi   */ +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts.  
We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +void clear_asid_other(void) +{ +	u16 asid; + +	/* +	 * This is only expected to be set if we have disabled +	 * kernel _PAGE_GLOBAL pages. +	 */ +	if (!static_cpu_has(X86_FEATURE_PTI)) { +		WARN_ON_ONCE(1); +		return; +	} + +	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { +		/* Do not need to flush the current asid */ +		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) +			continue; +		/* +		 * Make sure the next time we go to switch to +		 * this asid, we do a flush: +		 */ +		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); +	} +	this_cpu_write(cpu_tlbstate.invalidate_other, false); +} +  atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,  		return;  	} +	if (this_cpu_read(cpu_tlbstate.invalidate_other)) +		clear_asid_other(); +  	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {  		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=  		    next->context.ctx_id) @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,  	*need_flush = true;  } +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ +	unsigned long new_mm_cr3; + +	if (need_flush) { +		invalidate_user_asid(new_asid); +		new_mm_cr3 = build_cr3(pgdir, new_asid); +	} else { +		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); +	} + +	/* +	 * Caution: many callers of this function expect +	 * that load_cr3() is serializing and orders TLB +	 * fills with respect to the mm_cpumask writes. +	 */ +	write_cr3(new_mm_cr3); +} +  void leave_mm(int cpu)  {  	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -128,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  	 * isn't free.  	 */  #ifdef CONFIG_DEBUG_VM -	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { +	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {  		/*  		 * If we were to BUG here, we'd be very likely to kill  		 * the system so hard that we don't see the call trace. @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  		if (need_flush) {  			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);  			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); -			write_cr3(build_cr3(next, new_asid)); +			load_new_mm_cr3(next->pgd, new_asid, true);  			/*  			 * NB: This gets called via leave_mm() in the idle path @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);  		} else {  			/* The new ASID is already up to date. */ -			write_cr3(build_cr3_noflush(next, new_asid)); +			load_new_mm_cr3(next->pgd, new_asid, false);  			/* See above wrt _rcuidle. */  			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); @@ -288,7 +342,7 @@ void initialize_tlbstate_and_flush(void)  		!(cr4_read_shadow() & X86_CR4_PCIDE));  	/* Force ASID 0 and force a TLB flush. */ -	write_cr3(build_cr3(mm, 0)); +	write_cr3(build_cr3(mm->pgd, 0));  	/* Reinitialize tlbstate. 
*/  this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); @@ -551,7 +605,7 @@ static void do_kernel_range_flush(void *info)  	/* flush range by one by one 'invlpg' */  	for (addr = f->start; addr < f->end; addr += PAGE_SIZE) -		__flush_tlb_single(addr); +		__flush_tlb_one(addr);  }  void flush_tlb_kernel_range(unsigned long start, unsigned long end)