Diffstat (limited to 'arch/x86/mm')
35 files changed, 685 insertions, 1952 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 84373dc9b341..98f7c6fa2eaa 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -12,8 +12,10 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif -obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ + pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + +obj-y += pat/ # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -23,13 +25,11 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace -obj-$(CONFIG_X86_PAT) += pat_rbtree.o - obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o -obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o @@ -45,7 +45,6 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 752ad11d6868..56f9189bbadb 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif +#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); +#endif + struct cpu_entry_area *get_cpu_entry_area(int cpu) { unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; @@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) cea_map_stack(MCE); } #else -static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +static inline void percpu_setup_exception_stacks(unsigned int cpu) +{ +#ifdef CONFIG_DOUBLEFAULT + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + cea_map_percpu_pages(&cea->doublefault_stack, + &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); +#endif +} #endif /* Setup the fixmap mappings only once per-processor */ @@ -161,6 +173,14 @@ static void __init setup_cpu_entry_area(unsigned int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + /* + * VMX changes the host TR limit to 0x67 after a VM exit. This is + * okay, since 0x67 covers the size of struct x86_hw_tss. Make sure + * that this is correct. 
+ */ + BUILD_BUG_ON(offsetof(struct tss_struct, x86_tss) != 0); + BUILD_BUG_ON(sizeof(struct x86_hw_tss) != 0x68); + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); @@ -178,7 +198,9 @@ static __init void setup_cpu_entry_area_ptes(void) #ifdef CONFIG_X86_32 unsigned long start, end; - BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + /* The +1 is for the readonly IDT: */ + BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); + BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); start = CPU_ENTRY_AREA_BASE; diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 39001a401eff..4a3b62f780b4 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -7,7 +7,7 @@ static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk_pgd_level_debugfs(m, NULL, false); + ptdump_walk_pgd_level_debugfs(m, &init_mm, false); return 0; } @@ -15,11 +15,8 @@ DEFINE_SHOW_ATTRIBUTE(ptdump); static int ptdump_curknl_show(struct seq_file *m, void *v) { - if (current->mm->pgd) { - down_read(&current->mm->mmap_sem); - ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); - up_read(&current->mm->mmap_sem); - } + if (current->mm->pgd) + ptdump_walk_pgd_level_debugfs(m, current->mm, false); return 0; } @@ -28,11 +25,8 @@ DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); #ifdef CONFIG_PAGE_TABLE_ISOLATION static int ptdump_curusr_show(struct seq_file *m, void *v) { - if (current->mm->pgd) { - down_read(&current->mm->mmap_sem); - ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); - up_read(&current->mm->mmap_sem); - } + if (current->mm->pgd) + ptdump_walk_pgd_level_debugfs(m, current->mm, true); return 0; } @@ -43,7 +37,7 @@ DEFINE_SHOW_ATTRIBUTE(ptdump_curusr); static int ptdump_efi_show(struct seq_file *m, void *v) { if (efi_mm.pgd) - ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); + ptdump_walk_pgd_level_debugfs(m, &efi_mm, false); return 0; } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ab67822fd2f4..64229dad7eab 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -16,6 +16,7 @@ #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/pci.h> +#include <linux/ptdump.h> #include <asm/e820/types.h> #include <asm/pgtable.h> @@ -26,16 +27,18 @@ * when a "break" in the continuity is found. 
*/ struct pg_state { + struct ptdump_state ptdump; int level; - pgprot_t current_prot; + pgprotval_t current_prot; pgprotval_t effective_prot; + pgprotval_t prot_levels[5]; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; unsigned long lines; bool to_dmesg; bool check_wx; unsigned long wx_pages; + struct seq_file *seq; }; struct addr_marker { @@ -174,11 +177,10 @@ static struct addr_marker address_markers[] = { /* * Print a readable form of a pgprot_t to the seq_file */ -static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) +static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) { - pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = - { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; + { "pgd", "p4d", "pud", "pmd", "pte" }; if (!(pr & _PAGE_PRESENT)) { /* Not present */ @@ -202,12 +204,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); /* Bit 7 has a different meaning on level 3 vs 4 */ - if (level <= 4 && pr & _PAGE_PSE) + if (level <= 3 && pr & _PAGE_PSE) pt_dump_cont_printf(m, dmsg, "PSE "); else pt_dump_cont_printf(m, dmsg, " "); - if ((level == 5 && pr & _PAGE_PAT) || - ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) + if ((level == 4 && pr & _PAGE_PAT) || + ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); @@ -223,24 +225,11 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); } -/* - * On 64 bits, sign-extend the 48 bit address to 64 bit - */ -static unsigned long normalize_addr(unsigned long u) -{ - int shift; - if (!IS_ENABLED(CONFIG_X86_64)) - return u; - - shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - return (signed long)(u << shift) >> shift; -} - -static void note_wx(struct pg_state *st) +static void note_wx(struct pg_state *st, unsigned long addr) { unsigned long npages; - npages = (st->current_address - st->start_address) / PAGE_SIZE; + npages = (addr - st->start_address) / PAGE_SIZE; #ifdef CONFIG_PCI_BIOS /* @@ -248,7 +237,7 @@ static void note_wx(struct pg_state *st) * Inform about it, but avoid the warning. */ if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && - st->current_address <= PAGE_OFFSET + BIOS_END) { + addr <= PAGE_OFFSET + BIOS_END) { pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); return; } @@ -260,27 +249,47 @@ static void note_wx(struct pg_state *st) (void *)st->start_address); } +static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +{ + return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | + ((prot1 | prot2) & _PAGE_NX); +} + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to * print what we collected so far. 
*/ -static void note_page(struct seq_file *m, struct pg_state *st, - pgprot_t new_prot, pgprotval_t new_eff, int level) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + unsigned long val) { - pgprotval_t prot, cur, eff; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t new_prot, new_eff; + pgprotval_t cur, eff; static const char units[] = "BKMGTPE"; + struct seq_file *m = st->seq; + + new_prot = val & PTE_FLAGS_MASK; + + if (level > 0) { + new_eff = effective_prot(st->prot_levels[level - 1], + new_prot); + } else { + new_eff = new_prot; + } + + if (level >= 0) + st->prot_levels[level] = new_eff; /* * If we have a "break" in the series, we need to flush the state that * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot); - cur = pgprot_val(st->current_prot); + cur = st->current_prot; eff = st->effective_prot; - if (!st->level) { + if (st->level == -1) { /* First entry */ st->current_prot = new_prot; st->effective_prot = new_eff; @@ -289,14 +298,14 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || new_eff != eff || level != st->level || - st->current_address >= st->marker[1].start_address) { + } else if (new_prot != cur || new_eff != eff || level != st->level || + addr >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) - note_wx(st); + note_wx(st, addr); /* * Now print the actual finished series @@ -306,9 +315,9 @@ static void note_page(struct seq_file *m, struct pg_state *st, pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ", width, st->start_address, - width, st->current_address); + width, addr); - delta = st->current_address - st->start_address; + delta = addr - st->start_address; while (!(delta & 1023) && unit[1]) { delta >>= 10; unit++; @@ -325,7 +334,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, * such as the start of vmalloc space etc. * This helps in the interpretation. */ - if (st->current_address >= st->marker[1].start_address) { + if (addr >= st->marker[1].start_address) { if (st->marker->max_lines && st->lines > st->marker->max_lines) { unsigned long nskip = @@ -341,222 +350,45 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->marker->name); } - st->start_address = st->current_address; + st->start_address = addr; st->current_prot = new_prot; st->effective_prot = new_eff; st->level = level; } } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) -{ - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); -} - -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, - pgprotval_t eff_in, unsigned long P) -{ - int i; - pte_t *pte; - pgprotval_t prot, eff; - - for (i = 0; i < PTRS_PER_PTE; i++) { - st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - pte = pte_offset_map(&addr, st->current_address); - prot = pte_flags(*pte); - eff = effective_prot(eff_in, prot); - note_page(m, st, __pgprot(prot), eff, 5); - pte_unmap(pte); - } -} -#ifdef CONFIG_KASAN - -/* - * This is an optimization for KASAN=y case. 
Since all kasan page tables - * eventually point to the kasan_early_shadow_page we could call note_page() - * right away without walking through lower level page tables. This saves - * us dozens of seconds (minutes for 5-level config) while checking for - * W+X mapping or reading kernel_page_tables debugfs file. - */ -static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, - void *pt) -{ - if (__pa(pt) == __pa(kasan_early_shadow_pmd) || - (pgtable_l5_enabled() && - __pa(pt) == __pa(kasan_early_shadow_p4d)) || - __pa(pt) == __pa(kasan_early_shadow_pud)) { - pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]); - note_page(m, st, __pgprot(prot), 0, 5); - return true; - } - return false; -} -#else -static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, - void *pt) -{ - return false; -} -#endif - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, - pgprotval_t eff_in, unsigned long P) -{ - int i; - pmd_t *start, *pmd_start; - pgprotval_t prot, eff; - - pmd_start = start = (pmd_t *)pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); - if (!pmd_none(*start)) { - prot = pmd_flags(*start); - eff = effective_prot(eff_in, prot); - if (pmd_large(*start) || !pmd_present(*start)) { - note_page(m, st, __pgprot(prot), eff, 4); - } else if (!kasan_page_table(m, st, pmd_start)) { - walk_pte_level(m, st, *start, eff, - P + i * PMD_LEVEL_MULT); - } - } else - note_page(m, st, __pgprot(0), 0, 4); - start++; - } -} - -#else -#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p) -#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, - pgprotval_t eff_in, unsigned long P) -{ - int i; - pud_t *start, *pud_start; - pgprotval_t prot, eff; - - pud_start = start = (pud_t *)p4d_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start)) { - prot = pud_flags(*start); - eff = effective_prot(eff_in, prot); - if (pud_large(*start) || !pud_present(*start)) { - note_page(m, st, __pgprot(prot), eff, 3); - } else if (!kasan_page_table(m, st, pud_start)) { - walk_pmd_level(m, st, *start, eff, - P + i * PUD_LEVEL_MULT); - } - } else - note_page(m, st, __pgprot(0), 0, 3); - - start++; - } -} - -#else -#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p) -#define p4d_large(a) pud_large(__pud(p4d_val(a))) -#define p4d_none(a) pud_none(__pud(p4d_val(a))) -#endif - -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, - pgprotval_t eff_in, unsigned long P) -{ - int i; - p4d_t *start, *p4d_start; - pgprotval_t prot, eff; - - if (PTRS_PER_P4D == 1) - return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P); - - p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_P4D; i++) { - st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); - if (!p4d_none(*start)) { - prot = p4d_flags(*start); - eff = effective_prot(eff_in, prot); - if (p4d_large(*start) || !p4d_present(*start)) { - note_page(m, st, __pgprot(prot), eff, 2); - } else if (!kasan_page_table(m, st, p4d_start)) { - walk_pud_level(m, st, *start, eff, - P + i * P4D_LEVEL_MULT); - } - } else - note_page(m, st, __pgprot(0), 0, 2); - - start++; - } -} - -#define 
pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) -#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) - -static inline bool is_hypervisor_range(int idx) -{ -#ifdef CONFIG_X86_64 - /* - * A hole in the beginning of kernel address space reserved - * for a hypervisor. - */ - return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) && - (idx < pgd_index(GUARD_HOLE_END_ADDR)); -#else - return false; -#endif -} - -static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, +static void ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, bool checkwx, bool dmesg) { - pgd_t *start = INIT_PGD; - pgprotval_t prot, eff; - int i; - struct pg_state st = {}; - - if (pgd) { - start = pgd; - st.to_dmesg = dmesg; - } + const struct ptdump_range ptdump_ranges[] = { +#ifdef CONFIG_X86_64 - st.check_wx = checkwx; - if (checkwx) - st.wx_pages = 0; +#define normalize_addr_shift (64 - (__VIRTUAL_MASK_SHIFT + 1)) +#define normalize_addr(u) ((signed long)((u) << normalize_addr_shift) >> \ + normalize_addr_shift) - for (i = 0; i < PTRS_PER_PGD; i++) { - st.current_address = normalize_addr(i * PGD_LEVEL_MULT); - if (!pgd_none(*start) && !is_hypervisor_range(i)) { - prot = pgd_flags(*start); -#ifdef CONFIG_X86_PAE - eff = _PAGE_USER | _PAGE_RW; + {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, + {normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT / 2), ~0UL}, #else - eff = prot; + {0, ~0UL}, #endif - if (pgd_large(*start) || !pgd_present(*start)) { - note_page(m, &st, __pgprot(prot), eff, 1); - } else { - walk_p4d_level(m, &st, *start, eff, - i * PGD_LEVEL_MULT); - } - } else - note_page(m, &st, __pgprot(0), 0, 1); + {0, 0} +}; - cond_resched(); - start++; - } + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = ptdump_ranges + }, + .level = -1, + .to_dmesg = dmesg, + .check_wx = checkwx, + .seq = m + }; + + ptdump_walk_pgd(&st.ptdump, mm, pgd); - /* Flush out the last page */ - st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); - note_page(m, &st, __pgprot(0), 0, 0); if (!checkwx) return; if (st.wx_pages) @@ -566,18 +398,20 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); } -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) { - ptdump_walk_pgd_level_core(m, pgd, false, true); + ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true); } -void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, + bool user) { + pgd_t *pgd = mm->pgd; #ifdef CONFIG_PAGE_TABLE_ISOLATION if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif - ptdump_walk_pgd_level_core(m, pgd, false, false); + ptdump_walk_pgd_level_core(m, mm, pgd, false, false); } EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); @@ -592,13 +426,13 @@ void ptdump_walk_user_pgd_level_checkwx(void) pr_info("x86/mm: Checking user space page tables\n"); pgd = kernel_to_user_pgdp(pgd); - ptdump_walk_pgd_level_core(NULL, pgd, true, false); + ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false); #endif } void ptdump_walk_pgd_level_checkwx(void) { - ptdump_walk_pgd_level_core(NULL, NULL, true, false); + ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } static int __init pt_dump_init(void) diff --git a/arch/x86/mm/extable.c 
b/arch/x86/mm/extable.c index 4d75bc656f97..30bb0bd3b1b8 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -45,55 +45,6 @@ __visible bool ex_handler_fault(const struct exception_table_entry *fixup, EXPORT_SYMBOL_GPL(ex_handler_fault); /* - * Handler for UD0 exception following a failed test against the - * result of a refcount inc/dec/add/sub. - */ -__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* First unconditionally saturate the refcount. */ - *(int *)regs->cx = INT_MIN / 2; - - /* - * Strictly speaking, this reports the fixup destination, not - * the fault location, and not the actually overflowing - * instruction, which is the instruction before the "js", but - * since that instruction could be a variety of lengths, just - * report the location after the overflow, which should be close - * enough for finding the overflow, as it's at least back in - * the function, having returned from .text.unlikely. - */ - regs->ip = ex_fixup_addr(fixup); - - /* - * This function has been called because either a negative refcount - * value was seen by any of the refcount functions, or a zero - * refcount value was seen by refcount_dec(). - * - * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result - * wrapped around) will be set. Additionally, seeing the refcount - * reach 0 will set ZF (Zero Flag: result was zero). In each of - * these cases we want a report, since it's a boundary condition. - * The SF case is not reported since it indicates post-boundary - * manipulations below zero or above INT_MAX. And if none of the - * flags are set, something has gone very wrong, so report it. - */ - if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { - bool zero = regs->flags & X86_EFLAGS_ZF; - - refcount_error_report(regs, zero ? "hit zero" : "overflow"); - } else if ((regs->flags & X86_EFLAGS_SF) == 0) { - /* Report if none of OF, ZF, nor SF are set. */ - refcount_error_report(regs, "unexpected saturation"); - } - - return true; -} -EXPORT_SYMBOL(ex_handler_refcount); - -/* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (task->thread.fpu.state) * should always be valid. However, past bugs have allowed userspace to set diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9ceacd1156db..fa4ea09593ab 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -29,6 +29,7 @@ #include <asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ +#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -197,7 +198,7 @@ void vmalloc_sync_all(void) return; for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < FIXADDR_TOP; + address >= TASK_SIZE_MAX && address < VMALLOC_END; address += PMD_SIZE) { struct page *page; @@ -1486,27 +1487,6 @@ good_area: } NOKPROBE_SYMBOL(do_user_addr_fault); -/* - * Explicitly marked noinline such that the function tracer sees this as the - * page_fault entry point. - */ -static noinline void -__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, - unsigned long address) -{ - prefetchw(&current->mm->mmap_sem); - - if (unlikely(kmmio_fault(regs, address))) - return; - - /* Was the fault on kernel-controlled part of the address space? 
*/ - if (unlikely(fault_in_kernel_space(address))) - do_kern_addr_fault(regs, hw_error_code, address); - else - do_user_addr_fault(regs, hw_error_code, address); -} -NOKPROBE_SYMBOL(__do_page_fault); - static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -1521,13 +1501,19 @@ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, } dotraplinkage void -do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) { - enum ctx_state prev_state; + prefetchw(&current->mm->mmap_sem); + trace_page_fault_entries(regs, hw_error_code, address); - prev_state = exception_enter(); - trace_page_fault_entries(regs, error_code, address); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); + if (unlikely(kmmio_fault(regs, address))) + return; + + /* Was the fault on kernel-controlled part of the address space? */ + if (unlikely(fault_in_kernel_space(address))) + do_kern_addr_fault(regs, hw_error_code, address); + else + do_user_addr_fault(regs, hw_error_code, address); } NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index fab095362c50..5bfd5aef5378 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,7 +19,6 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> -#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -151,10 +150,6 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; - addr = mpx_unmapped_area_check(addr, len, flags); - if (IS_ERR_VALUE(addr)) - return addr; - if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd10d91a6115..e7bb483557c9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -829,14 +829,13 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) * used for the kernel image only. free_init_pages() will do the * right thing for either kind of address. */ -void free_kernel_image_pages(void *begin, void *end) +void free_kernel_image_pages(const char *what, void *begin, void *end) { unsigned long begin_ul = (unsigned long)begin; unsigned long end_ul = (unsigned long)end; unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; - - free_init_pages("unused kernel image", begin_ul, end_ul); + free_init_pages(what, begin_ul, end_ul); /* * PTI maps some of the kernel into userspace. 
For performance, @@ -865,7 +864,8 @@ void __ref free_initmem(void) mem_encrypt_free_decrypted_mem(); - free_kernel_image_pages(&__init_begin, &__init_end); + free_kernel_image_pages("unused kernel image (initmem)", + &__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4068abb9427f..23df4885bbed 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -52,6 +52,7 @@ #include <asm/page_types.h> #include <asm/cpu_entry_area.h> #include <asm/init.h> +#include <asm/pgtable_areas.h> #include "mm_internal.h" @@ -865,43 +866,13 @@ void arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); } #endif int kernel_set_to_readonly __read_mostly; -void set_kernel_text_rw(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read write\n", - start, start+size); - - set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); -} - -void set_kernel_text_ro(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read only\n", - start, start+size); - - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); -} - static void mark_nxdata_nx(void) { /* @@ -916,7 +887,7 @@ static void mark_nxdata_nx(void) if (__supported_pte_mask & _PAGE_NX) printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); - set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); + set_memory_nx(start, size >> PAGE_SHIFT); } void mark_rodata_ro(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a6b5c653727b..abbdecb75fad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1212,10 +1212,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); - struct zone *zone = page_zone(page); - __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -1260,49 +1258,13 @@ void __init mem_init(void) int kernel_set_to_readonly; -void set_kernel_text_rw(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__stop___ex_table); - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read write\n", - start, end); - - /* - * Make the kernel identity mapping for text RW. Kernel text - * mapping will always be RO. Refer to the comment in - * static_protections() in pageattr.c - */ - set_memory_rw(start, (end - start) >> PAGE_SHIFT); -} - -void set_kernel_text_ro(void) -{ - unsigned long start = PFN_ALIGN(_text); - unsigned long end = PFN_ALIGN(__stop___ex_table); - - if (!kernel_set_to_readonly) - return; - - pr_debug("Set kernel text: %lx - %lx for read only\n", - start, end); - - /* - * Set the kernel identity mapping for text RO. 
- */ - set_memory_ro(start, (end - start) >> PAGE_SHIFT); -} - void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = PFN_ALIGN(__start_rodata); - unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long text_end = PFN_ALIGN(&__stop___ex_table); - unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long text_end = PFN_ALIGN(_etext); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", @@ -1334,8 +1296,10 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_kernel_image_pages((void *)text_end, (void *)rodata_start); - free_kernel_image_pages((void *)rodata_end, (void *)_sdata); + free_kernel_image_pages("unused kernel image (text/rodata gap)", + (void *)text_end, (void *)rodata_start); + free_kernel_image_pages("unused kernel image (rodata/data gap)", + (void *)rodata_end, (void *)_sdata); debug_checkwx(); } diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 6748b4c2baff..f60398aeb644 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -4,7 +4,7 @@ */ #include <asm/iomap.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <linux/export.h> #include <linux/highmem.h> @@ -26,7 +26,7 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) if (!is_io_mapping_possible(base, size)) return -EINVAL; - ret = io_reserve_memtype(base, base + size, &pcm); + ret = memtype_reserve_io(base, base + size, &pcm); if (ret) return ret; @@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(iomap_create_wc); void iomap_free(resource_size_t base, unsigned long size) { - io_free_memtype(base, base + size); + memtype_free_io(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 63e99f15d7cf..44e4beb4239f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -19,11 +19,12 @@ #include <asm/set_memory.h> #include <asm/e820/api.h> +#include <asm/efi.h> #include <asm/fixmap.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/setup.h> #include "physaddr.h" @@ -195,10 +196,10 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; - retval = reserve_memtype(phys_addr, (u64)phys_addr + size, + retval = memtype_reserve(phys_addr, (u64)phys_addr + size, pcm, &new_pcm); if (retval) { - printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); + printk(KERN_ERR "ioremap memtype_reserve failed %d\n", retval); return NULL; } @@ -254,7 +255,7 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, pcm)) + if (memtype_kernel_map_sync(phys_addr, size, pcm)) goto err_free_area; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) @@ -274,16 +275,16 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size, err_free_area: free_vm_area(area); err_free_memtype: - free_memtype(phys_addr, phys_addr + size); + memtype_free(phys_addr, phys_addr + size); return NULL; } /** - * ioremap_nocache - map bus memory into CPU space + * ioremap - map bus memory into CPU space * @phys_addr: bus address of the memory * @size: size of the resource 
to map * - * ioremap_nocache performs a platform specific sequence of operations to + * ioremap performs a platform specific sequence of operations to * make bus memory CPU accessible via the readb/readw/readl/writeb/ * writew/writel functions and the other mmio helpers. The returned * address is not guaranteed to be usable directly as a virtual @@ -299,7 +300,7 @@ err_free_memtype: * * Must be freed with iounmap. */ -void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: @@ -314,7 +315,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) return __ioremap_caller(phys_addr, size, pcm, __builtin_return_address(0), false); } -EXPORT_SYMBOL(ioremap_nocache); +EXPORT_SYMBOL(ioremap); /** * ioremap_uc - map bus memory into CPU space as strongly uncachable @@ -450,7 +451,7 @@ void iounmap(volatile void __iomem *addr) return; } - free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ o = remove_vm_area((void __force *)addr); @@ -625,6 +626,17 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + memunmap(data); + return true; + } + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } + memunmap(data); if ((phys_addr > paddr) && (phys_addr < (paddr + len))) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 296da58f3013..763e71abc0fe 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -245,22 +245,48 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) } while (pgd++, addr = next, addr != end); } -#ifdef CONFIG_KASAN_INLINE -static int kasan_die_handler(struct notifier_block *self, - unsigned long val, - void *data) +static void __init kasan_shallow_populate_p4ds(pgd_t *pgd, + unsigned long addr, + unsigned long end) { - if (val == DIE_GPF) { - pr_emerg("CONFIG_KASAN_INLINE enabled\n"); - pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n"); - } - return NOTIFY_OK; + p4d_t *p4d; + unsigned long next; + void *p; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (p4d_none(*p4d)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + p4d_populate(&init_mm, p4d, p); + } + } while (p4d++, addr = next, addr != end); } -static struct notifier_block kasan_die_notifier = { - .notifier_call = kasan_die_handler, -}; -#endif +static void __init kasan_shallow_populate_pgds(void *start, void *end) +{ + unsigned long addr, next; + pgd_t *pgd; + void *p; + + addr = (unsigned long)start; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, (unsigned long)end); + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + pgd_populate(&init_mm, pgd, p); + } + + /* + * we need to populate p4ds to be synced when running in + * four level mode - see sync_global_pgds_l4() + */ + kasan_shallow_populate_p4ds(pgd, addr, next); + } while (pgd++, addr = next, addr != (unsigned long)end); +} void __init kasan_early_init(void) { @@ -298,10 +324,6 @@ void __init kasan_init(void) int i; void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; -#ifdef CONFIG_KASAN_INLINE - 
register_die_notifier(&kasan_die_notifier); -#endif - memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); /* @@ -354,6 +376,24 @@ void __init kasan_init(void) kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)VMALLOC_START)); + + /* + * If we're in full vmalloc mode, don't back vmalloc space with early + * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to + * the global table and we can populate the lower levels on demand. + */ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_shallow_populate_pgds( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + else + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_END + 1), shadow_cpu_entry_begin); kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 79eb55ce69a9..49d7814b59a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -193,8 +193,8 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", - f->addr, f->count, !!f->old_presence); + pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), @@ -341,8 +341,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c new file mode 100644 index 000000000000..f5b85bdc0535 --- /dev/null +++ b/arch/x86/mm/maccess.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#ifdef CONFIG_X86_64 +static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + /* + * Range covering the highest possible canonical userspace address + * as well as non-canonical address range. For the canonical range + * we also need to include the userspace guard page. 
+ */ + return vaddr < TASK_SIZE_MAX + PAGE_SIZE || + canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr; +} +#else +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + return vaddr < TASK_SIZE_MAX; +} +#endif + +long probe_kernel_read_strict(void *dst, const void *src, size_t size) +{ + if (unlikely(invalid_probe_range((unsigned long)src))) + return -EFAULT; + + return __probe_kernel_read(dst, src, size); +} + +long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count) +{ + if (unlikely(invalid_probe_range((unsigned long)unsafe_addr))) + return -EFAULT; + + return __strncpy_from_unsafe(dst, unsafe_addr, count); +} diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index fece30ca8b0c..a03614bd3e1a 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -344,13 +344,11 @@ bool sme_active(void) { return sme_me_mask && !sev_enabled; } -EXPORT_SYMBOL(sme_active); bool sev_active(void) { return sme_me_mask && sev_enabled; } -EXPORT_SYMBOL(sev_active); /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) @@ -369,7 +367,7 @@ bool force_dma_unencrypted(struct device *dev) if (sme_active()) { u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, - dev->bus_dma_mask); + dev->bus_dma_limit); if (dma_dev_mask <= dma_enc_mask) return true; diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 6d71481a1e70..106ead05bbe3 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -ENTRY(sme_encrypt_execute) +SYM_FUNC_START(sme_encrypt_execute) /* * Entry parameters: @@ -66,9 +66,9 @@ ENTRY(sme_encrypt_execute) pop %rbp ret -ENDPROC(sme_encrypt_execute) +SYM_FUNC_END(sme_encrypt_execute) -ENTRY(__enc_copy) +SYM_FUNC_START(__enc_copy) /* * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since @@ -153,4 +153,4 @@ ENTRY(__enc_copy) ret .L__enc_copy_end: -ENDPROC(__enc_copy) +SYM_FUNC_END(__enc_copy) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index aae9a933dfd4..cb91eccc4960 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -163,8 +163,6 @@ unsigned long get_mmap_base(int is_legacy) const char *arch_vma_name(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_MPX) - return "[mpx]"; return NULL; } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index b8ef8557d4b3..673de6063345 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -394,7 +394,7 @@ static void enter_uniprocessor(void) } out: if (num_online_cpus() > 1) - pr_warning("multiple CPUs still online, may miss events.\n"); + pr_warn("multiple CPUs still online, may miss events.\n"); } static void leave_uniprocessor(void) @@ -418,8 +418,8 @@ static void leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning("multiple CPUs are online, may miss events. " - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warn("multiple CPUs are online, may miss events. 
" + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c deleted file mode 100644 index 895fb7a9294d..000000000000 --- a/arch/x86/mm/mpx.c +++ /dev/null @@ -1,938 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * mpx.c - Memory Protection eXtensions - * - * Copyright (c) 2014, Intel Corporation. - * Qiaowei Ren <qiaowei.ren@intel.com> - * Dave Hansen <dave.hansen@intel.com> - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/mm_types.h> -#include <linux/mman.h> -#include <linux/syscalls.h> -#include <linux/sched/sysctl.h> - -#include <asm/insn.h> -#include <asm/insn-eval.h> -#include <asm/mmu_context.h> -#include <asm/mpx.h> -#include <asm/processor.h> -#include <asm/fpu/internal.h> - -#define CREATE_TRACE_POINTS -#include <asm/trace/mpx.h> - -static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) -{ - if (is_64bit_mm(mm)) - return MPX_BD_SIZE_BYTES_64; - else - return MPX_BD_SIZE_BYTES_32; -} - -static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) -{ - if (is_64bit_mm(mm)) - return MPX_BT_SIZE_BYTES_64; - else - return MPX_BT_SIZE_BYTES_32; -} - -/* - * This is really a simplified "vm_mmap". it only handles MPX - * bounds tables (the bounds directory is user-allocated). - */ -static unsigned long mpx_mmap(unsigned long len) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, populate; - - /* Only bounds table can be allocated here */ - if (len != mpx_bt_size_bytes(mm)) - return -EINVAL; - - down_write(&mm->mmap_sem); - addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); - up_write(&mm->mmap_sem); - if (populate) - mm_populate(addr, populate); - - return addr; -} - -static int mpx_insn_decode(struct insn *insn, - struct pt_regs *regs) -{ - unsigned char buf[MAX_INSN_SIZE]; - int x86_64 = !test_thread_flag(TIF_IA32); - int not_copied; - int nr_copied; - - not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); - nr_copied = sizeof(buf) - not_copied; - /* - * The decoder _should_ fail nicely if we pass it a short buffer. - * But, let's not depend on that implementation detail. If we - * did not get anything, just error out now. - */ - if (!nr_copied) - return -EFAULT; - insn_init(insn, buf, nr_copied, x86_64); - insn_get_length(insn); - /* - * copy_from_user() tries to get as many bytes as we could see in - * the largest possible instruction. If the instruction we are - * after is shorter than that _and_ we attempt to copy from - * something unreadable, we might get a short read. This is OK - * as long as the read did not stop in the middle of the - * instruction. Check to see if we got a partial instruction. - */ - if (nr_copied < insn->length) - return -EFAULT; - - insn_get_opcode(insn); - /* - * We only _really_ need to decode bndcl/bndcn/bndcu - * Error out on anything else. - */ - if (insn->opcode.bytes[0] != 0x0f) - goto bad_opcode; - if ((insn->opcode.bytes[1] != 0x1a) && - (insn->opcode.bytes[1] != 0x1b)) - goto bad_opcode; - - return 0; -bad_opcode: - return -EINVAL; -} - -/* - * If a bounds overflow occurs then a #BR is generated. This - * function decodes MPX instructions to get violation address - * and set this address into extended struct siginfo. - * - * Note that this is not a super precise way of doing this. - * Userspace could have, by the time we get here, written - * anything it wants in to the instructions. 
We can not - * trust anything about it. They might not be valid - * instructions or might encode invalid registers, etc... - */ -int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) -{ - const struct mpx_bndreg_state *bndregs; - const struct mpx_bndreg *bndreg; - struct insn insn; - uint8_t bndregno; - int err; - - err = mpx_insn_decode(&insn, regs); - if (err) - goto err_out; - - /* - * We know at this point that we are only dealing with - * MPX instructions. - */ - insn_get_modrm(&insn); - bndregno = X86_MODRM_REG(insn.modrm.value); - if (bndregno > 3) { - err = -EINVAL; - goto err_out; - } - /* get bndregs field from current task's xsave area */ - bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS); - if (!bndregs) { - err = -EINVAL; - goto err_out; - } - /* now go select the individual register in the set of 4 */ - bndreg = &bndregs->bndreg[bndregno]; - - /* - * The registers are always 64-bit, but the upper 32 - * bits are ignored in 32-bit mode. Also, note that the - * upper bounds are architecturally represented in 1's - * complement form. - * - * The 'unsigned long' cast is because the compiler - * complains when casting from integers to different-size - * pointers. - */ - info->lower = (void __user *)(unsigned long)bndreg->lower_bound; - info->upper = (void __user *)(unsigned long)~bndreg->upper_bound; - info->addr = insn_get_addr_ref(&insn, regs); - - /* - * We were not able to extract an address from the instruction, - * probably because there was something invalid in it. - */ - if (info->addr == (void __user *)-1) { - err = -EINVAL; - goto err_out; - } - trace_mpx_bounds_register_exception(info->addr, bndreg); - return 0; -err_out: - /* info might be NULL, but kfree() handles that */ - return err; -} - -static __user void *mpx_get_bounds_dir(void) -{ - const struct mpx_bndcsr *bndcsr; - - if (!cpu_feature_enabled(X86_FEATURE_MPX)) - return MPX_INVALID_BOUNDS_DIR; - - /* - * The bounds directory pointer is stored in a register - * only accessible if we first do an xsave. - */ - bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); - if (!bndcsr) - return MPX_INVALID_BOUNDS_DIR; - - /* - * Make sure the register looks valid by checking the - * enable bit. - */ - if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) - return MPX_INVALID_BOUNDS_DIR; - - /* - * Lastly, mask off the low bits used for configuration - * flags, and return the address of the bounds table. - */ - return (void __user *)(unsigned long) - (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); -} - -int mpx_enable_management(void) -{ - void __user *bd_base = MPX_INVALID_BOUNDS_DIR; - struct mm_struct *mm = current->mm; - int ret = 0; - - /* - * runtime in the userspace will be responsible for allocation of - * the bounds directory. Then, it will save the base of the bounds - * directory into XSAVE/XRSTOR Save Area and enable MPX through - * XRSTOR instruction. - * - * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is - * expected to be relatively expensive. Storing the bounds - * directory here means that we do not have to do xsave in the - * unmap path; we can just use mm->context.bd_addr instead. - */ - bd_base = mpx_get_bounds_dir(); - down_write(&mm->mmap_sem); - - /* MPX doesn't support addresses above 47 bits yet. */ - if (find_vma(mm, DEFAULT_MAP_WINDOW)) { - pr_warn_once("%s (%d): MPX cannot handle addresses " - "above 47-bits. 
Disabling.", - current->comm, current->pid); - ret = -ENXIO; - goto out; - } - mm->context.bd_addr = bd_base; - if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) - ret = -ENXIO; -out: - up_write(&mm->mmap_sem); - return ret; -} - -int mpx_disable_management(void) -{ - struct mm_struct *mm = current->mm; - - if (!cpu_feature_enabled(X86_FEATURE_MPX)) - return -ENXIO; - - down_write(&mm->mmap_sem); - mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; - up_write(&mm->mmap_sem); - return 0; -} - -static int mpx_cmpxchg_bd_entry(struct mm_struct *mm, - unsigned long *curval, - unsigned long __user *addr, - unsigned long old_val, unsigned long new_val) -{ - int ret; - /* - * user_atomic_cmpxchg_inatomic() actually uses sizeof() - * the pointer that we pass to it to figure out how much - * data to cmpxchg. We have to be careful here not to - * pass a pointer to a 64-bit data type when we only want - * a 32-bit copy. - */ - if (is_64bit_mm(mm)) { - ret = user_atomic_cmpxchg_inatomic(curval, - addr, old_val, new_val); - } else { - u32 uninitialized_var(curval_32); - u32 old_val_32 = old_val; - u32 new_val_32 = new_val; - u32 __user *addr_32 = (u32 __user *)addr; - - ret = user_atomic_cmpxchg_inatomic(&curval_32, - addr_32, old_val_32, new_val_32); - *curval = curval_32; - } - return ret; -} - -/* - * With 32-bit mode, a bounds directory is 4MB, and the size of each - * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB, - * and the size of each bounds table is 4MB. - */ -static int allocate_bt(struct mm_struct *mm, long __user *bd_entry) -{ - unsigned long expected_old_val = 0; - unsigned long actual_old_val = 0; - unsigned long bt_addr; - unsigned long bd_new_entry; - int ret = 0; - - /* - * Carve the virtual space out of userspace for the new - * bounds table: - */ - bt_addr = mpx_mmap(mpx_bt_size_bytes(mm)); - if (IS_ERR((void *)bt_addr)) - return PTR_ERR((void *)bt_addr); - /* - * Set the valid flag (kinda like _PAGE_PRESENT in a pte) - */ - bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG; - - /* - * Go poke the address of the new bounds table in to the - * bounds directory entry out in userspace memory. Note: - * we may race with another CPU instantiating the same table. - * In that case the cmpxchg will see an unexpected - * 'actual_old_val'. - * - * This can fault, but that's OK because we do not hold - * mmap_sem at this point, unlike some of the other part - * of the MPX code that have to pagefault_disable(). - */ - ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry, - expected_old_val, bd_new_entry); - if (ret) - goto out_unmap; - - /* - * The user_atomic_cmpxchg_inatomic() will only return nonzero - * for faults, *not* if the cmpxchg itself fails. Now we must - * verify that the cmpxchg itself completed successfully. - */ - /* - * We expected an empty 'expected_old_val', but instead found - * an apparently valid entry. Assume we raced with another - * thread to instantiate this table and desclare succecss. - */ - if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { - ret = 0; - goto out_unmap; - } - /* - * We found a non-empty bd_entry but it did not have the - * VALID_FLAG set. Return an error which will result in - * a SEGV since this probably means that somebody scribbled - * some invalid data in to a bounds table. 
- */ - if (expected_old_val != actual_old_val) { - ret = -EINVAL; - goto out_unmap; - } - trace_mpx_new_bounds_table(bt_addr); - return 0; -out_unmap: - vm_munmap(bt_addr, mpx_bt_size_bytes(mm)); - return ret; -} - -/* - * When a BNDSTX instruction attempts to save bounds to a bounds - * table, it will first attempt to look up the table in the - * first-level bounds directory. If it does not find a table in - * the directory, a #BR is generated and we get here in order to - * allocate a new table. - * - * With 32-bit mode, the size of BD is 4MB, and the size of each - * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, - * and the size of each bound table is 4MB. - */ -static int do_mpx_bt_fault(void) -{ - unsigned long bd_entry, bd_base; - const struct mpx_bndcsr *bndcsr; - struct mm_struct *mm = current->mm; - - bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); - if (!bndcsr) - return -EINVAL; - /* - * Mask off the preserve and enable bits - */ - bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; - /* - * The hardware provides the address of the missing or invalid - * entry via BNDSTATUS, so we don't have to go look it up. - */ - bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; - /* - * Make sure the directory entry is within where we think - * the directory is. - */ - if ((bd_entry < bd_base) || - (bd_entry >= bd_base + mpx_bd_size_bytes(mm))) - return -EINVAL; - - return allocate_bt(mm, (long __user *)bd_entry); -} - -int mpx_handle_bd_fault(void) -{ - /* - * Userspace never asked us to manage the bounds tables, - * so refuse to help. - */ - if (!kernel_managing_mpx_tables(current->mm)) - return -EINVAL; - - return do_mpx_bt_fault(); -} - -/* - * A thin wrapper around get_user_pages(). Returns 0 if the - * fault was resolved or -errno if not. - */ -static int mpx_resolve_fault(long __user *addr, int write) -{ - long gup_ret; - int nr_pages = 1; - - gup_ret = get_user_pages((unsigned long)addr, nr_pages, - write ? FOLL_WRITE : 0, NULL, NULL); - /* - * get_user_pages() returns number of pages gotten. - * 0 means we failed to fault in and get anything, - * probably because 'addr' is bad. - */ - if (!gup_ret) - return -EFAULT; - /* Other error, return it */ - if (gup_ret < 0) - return gup_ret; - /* must have gup'd a page and gup_ret>0, success */ - return 0; -} - -static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, - unsigned long bd_entry) -{ - unsigned long bt_addr = bd_entry; - int align_to_bytes; - /* - * Bit 0 in a bt_entry is always the valid bit. - */ - bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG; - /* - * Tables are naturally aligned at 8-byte boundaries - * on 64-bit and 4-byte boundaries on 32-bit. The - * documentation makes it appear that the low bits - * are ignored by the hardware, so we do the same. - */ - if (is_64bit_mm(mm)) - align_to_bytes = 8; - else - align_to_bytes = 4; - bt_addr &= ~(align_to_bytes-1); - return bt_addr; -} - -/* - * We only want to do a 4-byte get_user() on 32-bit. Otherwise, - * we might run off the end of the bounds table if we are on - * a 64-bit kernel and try to get 8 bytes. - */ -static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, - long __user *bd_entry_ptr) -{ - u32 bd_entry_32; - int ret; - - if (is_64bit_mm(mm)) - return get_user(*bd_entry_ret, bd_entry_ptr); - - /* - * Note that get_user() uses the type of the *pointer* to - * establish the size of the get, not the destination. 
- */ - ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); - *bd_entry_ret = bd_entry_32; - return ret; -} - -/* - * Get the base of bounds tables pointed by specific bounds - * directory entry. - */ -static int get_bt_addr(struct mm_struct *mm, - long __user *bd_entry_ptr, - unsigned long *bt_addr_result) -{ - int ret; - int valid_bit; - unsigned long bd_entry; - unsigned long bt_addr; - - if (!access_ok((bd_entry_ptr), sizeof(*bd_entry_ptr))) - return -EFAULT; - - while (1) { - int need_write = 0; - - pagefault_disable(); - ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); - pagefault_enable(); - if (!ret) - break; - if (ret == -EFAULT) - ret = mpx_resolve_fault(bd_entry_ptr, need_write); - /* - * If we could not resolve the fault, consider it - * userspace's fault and error out. - */ - if (ret) - return ret; - } - - valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG; - bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry); - - /* - * When the kernel is managing bounds tables, a bounds directory - * entry will either have a valid address (plus the valid bit) - * *OR* be completely empty. If we see a !valid entry *and* some - * data in the address field, we know something is wrong. This - * -EINVAL return will cause a SIGSEGV. - */ - if (!valid_bit && bt_addr) - return -EINVAL; - /* - * Do we have an completely zeroed bt entry? That is OK. It - * just means there was no bounds table for this memory. Make - * sure to distinguish this from -EINVAL, which will cause - * a SEGV. - */ - if (!valid_bit) - return -ENOENT; - - *bt_addr_result = bt_addr; - return 0; -} - -static inline int bt_entry_size_bytes(struct mm_struct *mm) -{ - if (is_64bit_mm(mm)) - return MPX_BT_ENTRY_BYTES_64; - else - return MPX_BT_ENTRY_BYTES_32; -} - -/* - * Take a virtual address and turns it in to the offset in bytes - * inside of the bounds table where the bounds table entry - * controlling 'addr' can be found. - */ -static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, - unsigned long addr) -{ - unsigned long bt_table_nr_entries; - unsigned long offset = addr; - - if (is_64bit_mm(mm)) { - /* Bottom 3 bits are ignored on 64-bit */ - offset >>= 3; - bt_table_nr_entries = MPX_BT_NR_ENTRIES_64; - } else { - /* Bottom 2 bits are ignored on 32-bit */ - offset >>= 2; - bt_table_nr_entries = MPX_BT_NR_ENTRIES_32; - } - /* - * We know the size of the table in to which we are - * indexing, and we have eliminated all the low bits - * which are ignored for indexing. - * - * Mask out all the high bits which we do not need - * to index in to the table. Note that the tables - * are always powers of two so this gives us a proper - * mask. - */ - offset &= (bt_table_nr_entries-1); - /* - * We now have an entry offset in terms of *entries* in - * the table. We need to scale it back up to bytes. - */ - offset *= bt_entry_size_bytes(mm); - return offset; -} - -/* - * How much virtual address space does a single bounds - * directory entry cover? - * - * Note, we need a long long because 4GB doesn't fit in - * to a long on 32-bit. - */ -static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) -{ - unsigned long long virt_space; - unsigned long long GB = (1ULL << 30); - - /* - * This covers 32-bit emulation as well as 32-bit kernels - * running on 64-bit hardware. - */ - if (!is_64bit_mm(mm)) - return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; - - /* - * 'x86_virt_bits' returns what the hardware is capable - * of, and returns the full >32-bit address space when - * running 32-bit kernels on 64-bit hardware. 
- */ - virt_space = (1ULL << boot_cpu_data.x86_virt_bits); - return virt_space / MPX_BD_NR_ENTRIES_64; -} - -/* - * Free the backing physical pages of bounds table 'bt_addr'. - * Assume start...end is within that bounds table. - */ -static noinline int zap_bt_entries_mapping(struct mm_struct *mm, - unsigned long bt_addr, - unsigned long start_mapping, unsigned long end_mapping) -{ - struct vm_area_struct *vma; - unsigned long addr, len; - unsigned long start; - unsigned long end; - - /* - * if we 'end' on a boundary, the offset will be 0 which - * is not what we want. Back it up a byte to get the - * last bt entry. Then once we have the entry itself, - * move 'end' back up by the table entry size. - */ - start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping); - end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1); - /* - * Move end back up by one entry. Among other things - * this ensures that it remains page-aligned and does - * not screw up zap_page_range() - */ - end += bt_entry_size_bytes(mm); - - /* - * Find the first overlapping vma. If vma->vm_start > start, there - * will be a hole in the bounds table. This -EINVAL return will - * cause a SIGSEGV. - */ - vma = find_vma(mm, start); - if (!vma || vma->vm_start > start) - return -EINVAL; - - /* - * A NUMA policy on a VM_MPX VMA could cause this bounds table to - * be split. So we need to look across the entire 'start -> end' - * range of this bounds table, find all of the VM_MPX VMAs, and - * zap only those. - */ - addr = start; - while (vma && vma->vm_start < end) { - /* - * We followed a bounds directory entry down - * here. If we find a non-MPX VMA, that's bad, - * so stop immediately and return an error. This - * probably results in a SIGSEGV. - */ - if (!(vma->vm_flags & VM_MPX)) - return -EINVAL; - - len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len); - trace_mpx_unmap_zap(addr, addr+len); - - vma = vma->vm_next; - addr = vma->vm_start; - } - return 0; -} - -static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, - unsigned long addr) -{ - /* - * There are several ways to derive the bd offsets. We - * use the following approach here: - * 1. We know the size of the virtual address space - * 2. We know the number of entries in a bounds table - * 3. We know that each entry covers a fixed amount of - * virtual address space. - * So, we can just divide the virtual address by the - * virtual space used by one entry to determine which - * entry "controls" the given virtual address. - */ - if (is_64bit_mm(mm)) { - int bd_entry_size = 8; /* 64-bit pointer */ - /* - * Take the 64-bit addressing hole in to account. - */ - addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1); - return (addr / bd_entry_virt_space(mm)) * bd_entry_size; - } else { - int bd_entry_size = 4; /* 32-bit pointer */ - /* - * 32-bit has no hole so this case needs no mask - */ - return (addr / bd_entry_virt_space(mm)) * bd_entry_size; - } - /* - * The two return calls above are exact copies. If we - * pull out a single copy and put it in here, gcc won't - * realize that we're doing a power-of-2 divide and use - * shifts. It uses a real divide. If we put them up - * there, it manages to figure it out (gcc 4.8.3). 
- */ -} - -static int unmap_entire_bt(struct mm_struct *mm, - long __user *bd_entry, unsigned long bt_addr) -{ - unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; - unsigned long uninitialized_var(actual_old_val); - int ret; - - while (1) { - int need_write = 1; - unsigned long cleared_bd_entry = 0; - - pagefault_disable(); - ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, - bd_entry, expected_old_val, cleared_bd_entry); - pagefault_enable(); - if (!ret) - break; - if (ret == -EFAULT) - ret = mpx_resolve_fault(bd_entry, need_write); - /* - * If we could not resolve the fault, consider it - * userspace's fault and error out. - */ - if (ret) - return ret; - } - /* - * The cmpxchg was performed, check the results. - */ - if (actual_old_val != expected_old_val) { - /* - * Someone else raced with us to unmap the table. - * That is OK, since we were both trying to do - * the same thing. Declare success. - */ - if (!actual_old_val) - return 0; - /* - * Something messed with the bounds directory - * entry. We hold mmap_sem for read or write - * here, so it could not be a _new_ bounds table - * that someone just allocated. Something is - * wrong, so pass up the error and SIGSEGV. - */ - return -EINVAL; - } - /* - * Note, we are likely being called under do_munmap() already. To - * avoid recursion, do_munmap() will check whether it comes - * from one bounds table through VM_MPX flag. - */ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); -} - -static int try_unmap_single_bt(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct vm_area_struct *next; - struct vm_area_struct *prev; - /* - * "bta" == Bounds Table Area: the area controlled by the - * bounds table that we are unmapping. - */ - unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1); - unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm); - unsigned long uninitialized_var(bt_addr); - void __user *bde_vaddr; - int ret; - /* - * We already unlinked the VMAs from the mm's rbtree so 'start' - * is guaranteed to be in a hole. This gets us the first VMA - * before the hole in to 'prev' and the next VMA after the hole - * in to 'next'. - */ - next = find_vma_prev(mm, start, &prev); - /* - * Do not count other MPX bounds table VMAs as neighbors. - * Although theoretically possible, we do not allow bounds - * tables for bounds tables so our heads do not explode. - * If we count them as neighbors here, we may end up with - * lots of tables even though we have no actual table - * entries in use. - */ - while (next && (next->vm_flags & VM_MPX)) - next = next->vm_next; - while (prev && (prev->vm_flags & VM_MPX)) - prev = prev->vm_prev; - /* - * We know 'start' and 'end' lie within an area controlled - * by a single bounds table. See if there are any other - * VMAs controlled by that bounds table. If there are not - * then we can "expand" the are we are unmapping to possibly - * cover the entire table. - */ - next = find_vma_prev(mm, start, &prev); - if ((!prev || prev->vm_end <= bta_start_vaddr) && - (!next || next->vm_start >= bta_end_vaddr)) { - /* - * No neighbor VMAs controlled by same bounds - * table. Try to unmap the whole thing - */ - start = bta_start_vaddr; - end = bta_end_vaddr; - } - - bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); - ret = get_bt_addr(mm, bde_vaddr, &bt_addr); - /* - * No bounds table there, so nothing to unmap. 
- */ - if (ret == -ENOENT) { - ret = 0; - return 0; - } - if (ret) - return ret; - /* - * We are unmapping an entire table. Either because the - * unmap that started this whole process was large enough - * to cover an entire table, or that the unmap was small - * but was the area covered by a bounds table. - */ - if ((start == bta_start_vaddr) && - (end == bta_end_vaddr)) - return unmap_entire_bt(mm, bde_vaddr, bt_addr); - return zap_bt_entries_mapping(mm, bt_addr, start, end); -} - -static int mpx_unmap_tables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - unsigned long one_unmap_start; - trace_mpx_unmap_search(start, end); - - one_unmap_start = start; - while (one_unmap_start < end) { - int ret; - unsigned long next_unmap_start = ALIGN(one_unmap_start+1, - bd_entry_virt_space(mm)); - unsigned long one_unmap_end = end; - /* - * if the end is beyond the current bounds table, - * move it back so we only deal with a single one - * at a time - */ - if (one_unmap_end > next_unmap_start) - one_unmap_end = next_unmap_start; - ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end); - if (ret) - return ret; - - one_unmap_start = next_unmap_start; - } - return 0; -} - -/* - * Free unused bounds tables covered in a virtual address region being - * munmap()ed. Assume end > start. - * - * This function will be called by do_munmap(), and the VMAs covering - * the virtual address region start...end have already been split if - * necessary, and the 'vma' is the first vma in this range (start -> end). - */ -void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - struct vm_area_struct *vma; - int ret; - - /* - * Refuse to do anything unless userspace has asked - * the kernel to help manage the bounds tables, - */ - if (!kernel_managing_mpx_tables(current->mm)) - return; - /* - * This will look across the entire 'start -> end' range, - * and find all of the non-VM_MPX VMAs. - * - * To avoid recursion, if a VM_MPX vma is found in the range - * (start->end), we will not continue follow-up work. This - * recursion represents having bounds tables for bounds tables, - * which should not occur normally. Being strict about it here - * helps ensure that we do not have an exploitable stack overflow. - */ - vma = find_vma(mm, start); - while (vma && vma->vm_start < end) { - if (vma->vm_flags & VM_MPX) - return; - vma = vma->vm_next; - } - - ret = mpx_unmap_tables(mm, start, end); - if (ret) - force_sig(SIGSEGV); -} - -/* MPX cannot handle addresses above 47 bits yet. */ -unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, - unsigned long flags) -{ - if (!kernel_managing_mpx_tables(current->mm)) - return addr; - if (addr + len <= DEFAULT_MAP_WINDOW) - return addr; - if (flags & MAP_FIXED) - return -ENOMEM; - - /* - * Requested len is larger than the whole area we're allowed to map in. - * Resetting hinting address wouldn't do much good -- fail early. - */ - if (len > DEFAULT_MAP_WINDOW) - return -ENOMEM; - - /* Look for unmap area within DEFAULT_MAP_WINDOW */ - return 0; -} diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e6dad600614c..99f7a68738f0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -699,7 +699,7 @@ static int __init dummy_numa_init(void) * x86_numa_init - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. 
The - * last fallback is dummy single node config encomapssing whole memory and + * last fallback is dummy single node config encompassing whole memory and * never fails. */ void __init x86_numa_init(void) @@ -861,9 +861,9 @@ void numa_remove_cpu(int cpu) */ const struct cpumask *cpumask_of_node(int node) { - if (node >= nr_node_ids) { + if ((unsigned)node >= nr_node_ids) { printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%u)\n", + "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n", node, nr_node_ids); dump_stack(); return cpu_none_mask; diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index abffa0be80da..7f1d2034df1e 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -438,7 +438,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) goto no_emu; if (numa_cleanup_meminfo(&ei) < 0) { - pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); goto no_emu; } @@ -449,7 +449,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (!phys) { - pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } memblock_reserve(phys, phys_size); diff --git a/arch/x86/mm/pat/Makefile b/arch/x86/mm/pat/Makefile new file mode 100644 index 000000000000..ea464c995161 --- /dev/null +++ b/arch/x86/mm/pat/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := set_memory.o memtype.o + +obj-$(CONFIG_X86_PAT) += memtype_interval.o diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pat/cpa-test.c index facce271e8b9..facce271e8b9 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pat/cpa-test.c diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat/memtype.c index d9fbd4f69920..394be8611748 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat/memtype.c @@ -1,11 +1,34 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Handle caching attributes in page tables (PAT) + * Page Attribute Table (PAT) support: handle memory caching attributes in page tables. * * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> * Suresh B Siddha <suresh.b.siddha@intel.com> * * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + * + * Basic principles: + * + * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and + * the kernel to set one of a handful of 'caching type' attributes for physical + * memory ranges: uncached, write-combining, write-through, write-protected, + * and the most commonly used and default attribute: write-back caching. + * + * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is + * a hardware interface to enumerate a limited number of physical memory ranges + * and set their caching attributes explicitly, programmed into the CPU via MSRs. + * Even modern CPUs have MTRRs enabled - but these are typically not touched + * by the kernel or by user-space (such as the X server); we rely on PAT for any + * additional cache attribute logic.
+ * + * PAT doesn't work via explicit memory ranges, but uses page table entries to add + * cache attribute information to the mapped memory range: there's 3 bits used, + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the + * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT). + * + * ( There's a metric ton of finer details, such as compatibility with CPU quirks + * that only support 4 types of PAT entries, and interaction with MTRRs, see + * below for details. ) */ #include <linux/seq_file.h> @@ -29,44 +52,48 @@ #include <asm/mtrr.h> #include <asm/page.h> #include <asm/msr.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/io.h> -#include "pat_internal.h" -#include "mm_internal.h" +#include "memtype.h" +#include "../mm_internal.h" #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_bp_initialized; static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); -static bool __read_mostly pat_initialized; -static bool __read_mostly init_cm_done; +static bool __read_mostly pat_bp_enabled; +static bool __read_mostly pat_cm_initialized; -void pat_disable(const char *reason) +/* + * PAT support is enabled by default, but can be disabled for + * various user-requested or hardware-forced reasons: + */ +void pat_disable(const char *msg_reason) { if (pat_disabled) return; - if (boot_cpu_done) { + if (pat_bp_initialized) { WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); return; } pat_disabled = true; - pr_info("x86/PAT: %s\n", reason); + pr_info("x86/PAT: %s\n", msg_reason); } static int __init nopat(char *str) { - pat_disable("PAT support disabled."); + pat_disable("PAT support disabled via boot option."); return 0; } early_param("nopat", nopat); bool pat_enabled(void) { - return pat_initialized; + return pat_bp_enabled; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -197,6 +224,8 @@ static void __init_cache_modes(u64 pat) char pat_msg[33]; int i; + WARN_ON_ONCE(pat_cm_initialized); + pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, @@ -205,28 +234,28 @@ static void __init_cache_modes(u64 pat) } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); - init_cm_done = true; + pat_cm_initialized = true; } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) -static void pat_bsp_init(u64 pat) +static void pat_bp_init(u64 pat) { u64 tmp_pat; if (!boot_cpu_has(X86_FEATURE_PAT)) { - pat_disable("PAT not supported by CPU."); + pat_disable("PAT not supported by the CPU."); return; } rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { - pat_disable("PAT MSR is 0, disabled."); + pat_disable("PAT support disabled by the firmware."); return; } wrmsrl(MSR_IA32_CR_PAT, pat); - pat_initialized = true; + pat_bp_enabled = true; __init_cache_modes(pat); } @@ -248,7 +277,7 @@ void init_cache_modes(void) { u64 pat = 0; - if (init_cm_done) + if (pat_cm_initialized) return; if (boot_cpu_has(X86_FEATURE_PAT)) { @@ -291,7 +320,7 @@ void init_cache_modes(void) } /** - * pat_init - Initialize PAT MSR and PAT table + * pat_init - Initialize the PAT MSR and PAT table on the current CPU * * This function initializes PAT MSR and PAT table with an OS-defined value * to enable additional cache attributes, WC, WT and WP. 
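( Editor's illustrative sketch, not part of the patch: the new header comment above describes how three page table bits select one of the eight MSR_IA32_CR_PAT slots. A minimal example of that index calculation follows; the helper name is hypothetical, and the bit positions assume the usual 4K PTE layout with PWT = bit 3, PCD = bit 4, PAT = bit 7 -- large pages keep their PAT bit elsewhere. )

/*
 * Sketch only: compute the 3-bit PAT index encoded in a 4K PTE value.
 * The index selects one of the eight byte-wide entries of MSR_IA32_CR_PAT.
 */
static inline unsigned int pte_pat_index(unsigned long pteval)
{
        unsigned int idx = 0;

        idx |= (pteval >> 3) & 1;               /* _PAGE_PWT -> index bit 0 */
        idx |= ((pteval >> 4) & 1) << 1;        /* _PAGE_PCD -> index bit 1 */
        idx |= ((pteval >> 7) & 1) << 2;        /* _PAGE_PAT -> index bit 2 */

        return idx;                             /* PAT entry 0..7 */
}
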
@@ -305,6 +334,10 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; +#ifndef CONFIG_X86_PAT + pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); +#endif + if (pat_disabled) return; @@ -364,9 +397,9 @@ void pat_init(void) PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } - if (!boot_cpu_done) { - pat_bsp_init(pat); - boot_cpu_done = true; + if (!pat_bp_initialized) { + pat_bp_init(pat); + pat_bp_initialized = true; } else { pat_ap_init(pat); } @@ -542,10 +575,10 @@ static u64 sanitize_phys(u64 address) * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ -int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, +int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, enum page_cache_mode *new_type) { - struct memtype *new; + struct memtype *entry_new; enum page_cache_mode actual_type; int is_range_ram; int err = 0; @@ -593,22 +626,22 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, return -EINVAL; } - new = kzalloc(sizeof(struct memtype), GFP_KERNEL); - if (!new) + entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!entry_new) return -ENOMEM; - new->start = start; - new->end = end; - new->type = actual_type; + entry_new->start = start; + entry_new->end = end; + entry_new->type = actual_type; spin_lock(&memtype_lock); - err = rbt_memtype_check_insert(new, new_type); + err = memtype_check_insert(entry_new, new_type); if (err) { - pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n", start, end - 1, - cattr_name(new->type), cattr_name(req_type)); - kfree(new); + cattr_name(entry_new->type), cattr_name(req_type)); + kfree(entry_new); spin_unlock(&memtype_lock); return err; @@ -616,18 +649,17 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, spin_unlock(&memtype_lock); - dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", - start, end - 1, cattr_name(new->type), cattr_name(req_type), + dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(entry_new->type), cattr_name(req_type), new_type ? 
cattr_name(*new_type) : "-"); return err; } -int free_memtype(u64 start, u64 end) +int memtype_free(u64 start, u64 end) { - int err = -EINVAL; int is_range_ram; - struct memtype *entry; + struct memtype *entry_old; if (!pat_enabled()) return 0; @@ -640,28 +672,24 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) { - - err = free_ram_pages_type(start, end); - - return err; - } else if (is_range_ram < 0) { + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + if (is_range_ram < 0) return -EINVAL; - } spin_lock(&memtype_lock); - entry = rbt_memtype_erase(start, end); + entry_old = memtype_erase(start, end); spin_unlock(&memtype_lock); - if (IS_ERR(entry)) { + if (IS_ERR(entry_old)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; } - kfree(entry); + kfree(entry_old); - dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); + dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1); return 0; } @@ -693,13 +721,14 @@ static enum page_cache_mode lookup_memtype(u64 paddr) spin_lock(&memtype_lock); - entry = rbt_memtype_lookup(paddr); + entry = memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else rettype = _PAGE_CACHE_MODE_UC_MINUS; spin_unlock(&memtype_lock); + return rettype; } @@ -723,7 +752,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); /** - * io_reserve_memtype - Request a memory type mapping for a region of memory + * memtype_reserve_io - Request a memory type mapping for a region of memory * @start: start (physical address) of the region * @end: end (physical address) of the region * @type: A pointer to memtype, with requested type. 
On success, requested @@ -732,7 +761,7 @@ EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); * On success, returns 0 * On failure, returns non-zero */ -int io_reserve_memtype(resource_size_t start, resource_size_t end, +int memtype_reserve_io(resource_size_t start, resource_size_t end, enum page_cache_mode *type) { resource_size_t size = end - start; @@ -742,47 +771,47 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end, WARN_ON_ONCE(iomem_map_sanity_check(start, size)); - ret = reserve_memtype(start, end, req_type, &new_type); + ret = memtype_reserve(start, end, req_type, &new_type); if (ret) goto out_err; if (!is_new_memtype_allowed(start, size, req_type, new_type)) goto out_free; - if (kernel_map_sync_memtype(start, size, new_type) < 0) + if (memtype_kernel_map_sync(start, size, new_type) < 0) goto out_free; *type = new_type; return 0; out_free: - free_memtype(start, end); + memtype_free(start, end); ret = -EBUSY; out_err: return ret; } /** - * io_free_memtype - Release a memory type mapping for a region of memory + * memtype_free_io - Release a memory type mapping for a region of memory * @start: start (physical address) of the region * @end: end (physical address) of the region */ -void io_free_memtype(resource_size_t start, resource_size_t end) +void memtype_free_io(resource_size_t start, resource_size_t end) { - free_memtype(start, end); + memtype_free(start, end); } int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) { enum page_cache_mode type = _PAGE_CACHE_MODE_WC; - return io_reserve_memtype(start, start + size, &type); + return memtype_reserve_io(start, start + size, &type); } EXPORT_SYMBOL(arch_io_reserve_memtype_wc); void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) { - io_free_memtype(start, start + size); + memtype_free_io(start, start + size); } EXPORT_SYMBOL(arch_io_free_memtype_wc); @@ -839,10 +868,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, } /* - * Change the memory type for the physial address range in kernel identity + * Change the memory type for the physical address range in kernel identity * mapping space if that range is a part of identity map. */ -int kernel_map_sync_memtype(u64 base, unsigned long size, +int memtype_kernel_map_sync(u64 base, unsigned long size, enum page_cache_mode pcm) { unsigned long id_sz; @@ -851,15 +880,14 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, return 0; /* - * some areas in the middle of the kernel identity range - * are not mapped, like the PCI space. + * Some areas in the middle of the kernel identity range + * are not mapped, for example the PCI space. */ if (!page_is_ram(base >> PAGE_SHIFT)) return 0; id_sz = (__pa(high_memory-1) <= base + size) ? - __pa(high_memory) - base : - size; + __pa(high_memory) - base : size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", @@ -873,7 +901,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, /* * Internal interface to reserve a range of physical memory with prot. - * Reserved non RAM regions only and after successful reserve_memtype, + * Reserved non RAM regions only and after successful memtype_reserve, * this func also keeps identity mapping (if any) in sync with this new prot. 
*/ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, @@ -910,14 +938,14 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, return 0; } - ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm); + ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm); if (ret) return ret; if (pcm != want_pcm) { if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { - free_memtype(paddr, paddr + size); + memtype_free(paddr, paddr + size); pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), @@ -935,8 +963,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, cachemode2protval(pcm)); } - if (kernel_map_sync_memtype(paddr, size, pcm) < 0) { - free_memtype(paddr, paddr + size); + if (memtype_kernel_map_sync(paddr, size, pcm) < 0) { + memtype_free(paddr, paddr + size); return -EINVAL; } return 0; @@ -952,7 +980,7 @@ static void free_pfn_range(u64 paddr, unsigned long size) is_ram = pat_pagerange_is_ram(paddr, paddr + size); if (is_ram == 0) - free_memtype(paddr, paddr + size); + memtype_free(paddr, paddr + size); } /* @@ -1099,25 +1127,30 @@ EXPORT_SYMBOL_GPL(pgprot_writethrough); #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) +/* + * We are allocating a temporary printout-entry to be passed + * between seq_start()/next() and seq_show(): + */ static struct memtype *memtype_get_idx(loff_t pos) { - struct memtype *print_entry; + struct memtype *entry_print; int ret; - print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); - if (!print_entry) + entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!entry_print) return NULL; spin_lock(&memtype_lock); - ret = rbt_memtype_copy_nth_element(print_entry, pos); + ret = memtype_copy_nth_element(entry_print, pos); spin_unlock(&memtype_lock); - if (!ret) { - return print_entry; - } else { - kfree(print_entry); + /* Free it on error: */ + if (ret) { + kfree(entry_print); return NULL; } + + return entry_print; } static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) @@ -1142,11 +1175,14 @@ static void memtype_seq_stop(struct seq_file *seq, void *v) static int memtype_seq_show(struct seq_file *seq, void *v) { - struct memtype *print_entry = (struct memtype *)v; + struct memtype *entry_print = (struct memtype *)v; + + seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n", + entry_print->start, + entry_print->end, + cattr_name(entry_print->type)); - seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), - print_entry->start, print_entry->end); - kfree(print_entry); + kfree(entry_print); return 0; } @@ -1178,7 +1214,6 @@ static int __init pat_memtype_list_init(void) } return 0; } - late_initcall(pat_memtype_list_init); #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat/memtype.h index eeb5caeb089b..cacecdbceb55 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat/memtype.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PAT_INTERNAL_H_ -#define __PAT_INTERNAL_H_ +#ifndef __MEMTYPE_H_ +#define __MEMTYPE_H_ extern int pat_debug_enable; @@ -29,21 +29,21 @@ static inline char *cattr_name(enum page_cache_mode pcm) } #ifdef CONFIG_X86_PAT -extern int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type); -extern struct memtype *rbt_memtype_erase(u64 start, u64 end); -extern struct memtype *rbt_memtype_lookup(u64 addr); -extern 
int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); +extern int memtype_check_insert(struct memtype *entry_new, + enum page_cache_mode *new_type); +extern struct memtype *memtype_erase(u64 start, u64 end); +extern struct memtype *memtype_lookup(u64 addr); +extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos); #else -static inline int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type) +static inline int memtype_check_insert(struct memtype *entry_new, + enum page_cache_mode *new_type) { return 0; } -static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +static inline struct memtype *memtype_erase(u64 start, u64 end) { return NULL; } -static inline struct memtype *rbt_memtype_lookup(u64 addr) +static inline struct memtype *memtype_lookup(u64 addr) { return NULL; } -static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos) { return 0; } #endif -#endif /* __PAT_INTERNAL_H_ */ +#endif /* __MEMTYPE_H_ */ diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c new file mode 100644 index 000000000000..a07e4882bf36 --- /dev/null +++ b/arch/x86/mm/pat/memtype_interval.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree used to store the PAT memory type reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> + +#include <asm/pgtable.h> +#include <asm/memtype.h> + +#include "memtype.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) which tree is ordered + * by the starting address. The tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course, as enforced by the PAT logic. + * + * memtype_lock protects the rbtree. 
+ */ + +static inline u64 interval_start(struct memtype *entry) +{ + return entry->start; +} + +static inline u64 interval_end(struct memtype *entry) +{ + return entry->end - 1; +} + +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end, + interval_start, interval_end, + static, interval) + +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED; + +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_match(u64 start, u64 end, int match_type) +{ + struct memtype *entry_match; + + entry_match = interval_iter_first(&memtype_rbroot, start, end-1); + + while (entry_match != NULL && entry_match->start < end) { + if ((match_type == MEMTYPE_EXACT_MATCH) && + (entry_match->start == start) && (entry_match->end == end)) + return entry_match; + + if ((match_type == MEMTYPE_END_MATCH) && + (entry_match->start < start) && (entry_match->end == end)) + return entry_match; + + entry_match = interval_iter_next(entry_match, start, end-1); + } + + return NULL; /* Returns NULL if there is no match */ +} + +static int memtype_check_conflict(u64 start, u64 end, + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) +{ + struct memtype *entry_match; + enum page_cache_mode found_type = reqtype; + + entry_match = interval_iter_first(&memtype_rbroot, start, end-1); + if (entry_match == NULL) + goto success; + + if (entry_match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", entry_match->start, entry_match->end); + found_type = entry_match->type; + + entry_match = interval_iter_next(entry_match, start, end-1); + while (entry_match) { + if (entry_match->type != found_type) + goto failure; + + entry_match = interval_iter_next(entry_match, start, end-1); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(entry_match->type)); + + return -EBUSY; +} + +int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_type) +{ + int err = 0; + + err = memtype_check_conflict(entry_new->start, entry_new->end, entry_new->type, ret_type); + if (err) + return err; + + if (ret_type) + entry_new->type = *ret_type; + + interval_insert(entry_new, &memtype_rbroot); + return 0; +} + +struct memtype *memtype_erase(u64 start, u64 end) +{ + struct memtype *entry_old; + + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH); + if (!entry_old) { + entry_old = memtype_match(start, end, MEMTYPE_END_MATCH); + if (!entry_old) + return ERR_PTR(-EINVAL); + } + + if (entry_old->start == start) { + /* munmap: erase this node */ + interval_remove(entry_old, &memtype_rbroot); + } else { + /* mremap: update the end value of this node */ + interval_remove(entry_old, &memtype_rbroot); + entry_old->end = start; + interval_insert(entry_old, &memtype_rbroot); + + return NULL; + } + + return entry_old; +} + +struct memtype *memtype_lookup(u64 addr) +{ + return interval_iter_first(&memtype_rbroot, addr, addr + PAGE_SIZE-1); +} + +/* + * Debugging helper, copy the Nth entry of the tree into a + * a copy for printout. 
This allows us to print out the tree + * via debugfs, without holding the memtype_lock too long: + */ +#ifdef CONFIG_DEBUG_FS +int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos) +{ + struct memtype *entry_match; + int i = 1; + + entry_match = interval_iter_first(&memtype_rbroot, 0, ULONG_MAX); + + while (entry_match && pos != i) { + entry_match = interval_iter_next(entry_match, 0, ULONG_MAX); + i++; + } + + if (entry_match) { /* pos == i */ + *entry_out = *entry_match; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pat/set_memory.c index 6a9a77a403c9..c4aedd00c1ba 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pat/set_memory.c @@ -24,10 +24,10 @@ #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> -#include <asm/pat.h> +#include <asm/memtype.h> #include <asm/set_memory.h> -#include "mm_internal.h" +#include "../mm_internal.h" /* * The current flushing context - we pass it instead of 5 arguments: @@ -331,7 +331,7 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -void __cpa_flush_tlb(void *data) +static void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; @@ -516,7 +516,7 @@ static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, - int warnlvl) + unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; @@ -535,9 +535,17 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; - res = protect_kernel_text_ro(start, end); - check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); - forbidden |= res; + /* + * Special case to preserve a large page. If the change spawns the + * full large page mapping then there is no point to split it + * up. Happens with ftrace and is going to be removed once ftrace + * switched to text_poke(). + */ + if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { + res = protect_kernel_text_ro(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); + forbidden |= res; + } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); @@ -610,6 +618,17 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) } EXPORT_SYMBOL_GPL(lookup_address); +/* + * Lookup the page table entry for a virtual address in a given mm. Return a + * pointer to the entry and the level of the mapping. + */ +pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, + unsigned int *level) +{ + return lookup_address_in_pgd(pgd_offset(mm, address), address, level); +} +EXPORT_SYMBOL_GPL(lookup_address_in_mm); + static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { @@ -819,7 +838,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, - CPA_CONFLICT); + psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* @@ -855,7 +874,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * protection requirement in the large page. 
*/ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, - CPA_DETECT); + psize, CPA_DETECT); /* * If there is a conflict, split the large page. @@ -906,7 +925,8 @@ static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, if (!cpa->force_static_prot) goto set; - prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT); + /* Hand in lpsize = 0 to enforce the protection mechanism */ + prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; @@ -1503,7 +1523,8 @@ repeat: pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); - new_prot = static_protections(new_prot, address, pfn, 1, + /* Hand in lpsize = 0 to enforce the protection mechanism */ + new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = pgprot_clear_protnone_bits(new_prot); @@ -1774,7 +1795,7 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, int _set_memory_uc(unsigned long addr, int numpages) { /* - * for now UC MINUS. see comments in ioremap_nocache() + * for now UC MINUS. see comments in ioremap() * If you really need strong UC use ioremap_uc(), but note * that you cannot override IO areas with set_memory_*() as * these helpers cannot work with IO memory. @@ -1789,9 +1810,9 @@ int set_memory_uc(unsigned long addr, int numpages) int ret; /* - * for now UC MINUS. see comments in ioremap_nocache() + * for now UC MINUS. see comments in ioremap() */ - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; @@ -1803,69 +1824,12 @@ int set_memory_uc(unsigned long addr, int numpages) return 0; out_free: - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); out_err: return ret; } EXPORT_SYMBOL(set_memory_uc); -static int _set_memory_array(unsigned long *addr, int numpages, - enum page_cache_mode new_type) -{ - enum page_cache_mode set_type; - int i, j; - int ret; - - for (i = 0; i < numpages; i++) { - ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, - new_type, NULL); - if (ret) - goto out_free; - } - - /* If WC, set to UC- first and then WC */ - set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
- _PAGE_CACHE_MODE_UC_MINUS : new_type; - - ret = change_page_attr_set(addr, numpages, - cachemode2pgprot(set_type), 1); - - if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(addr, numpages, - cachemode2pgprot( - _PAGE_CACHE_MODE_WC), - __pgprot(_PAGE_CACHE_MASK), - 0, CPA_ARRAY, NULL); - if (ret) - goto out_free; - - return 0; - -out_free: - for (j = 0; j < i; j++) - free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); - - return ret; -} - -int set_memory_array_uc(unsigned long *addr, int numpages) -{ - return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS); -} -EXPORT_SYMBOL(set_memory_array_uc); - -int set_memory_array_wc(unsigned long *addr, int numpages) -{ - return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC); -} -EXPORT_SYMBOL(set_memory_array_wc); - -int set_memory_array_wt(unsigned long *addr, int numpages) -{ - return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT); -} -EXPORT_SYMBOL_GPL(set_memory_array_wt); - int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1886,14 +1850,14 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) return ret; ret = _set_memory_wc(addr, numpages); if (ret) - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return ret; } @@ -1905,23 +1869,6 @@ int _set_memory_wt(unsigned long addr, int numpages) cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } -int set_memory_wt(unsigned long addr, int numpages) -{ - int ret; - - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_MODE_WT, NULL); - if (ret) - return ret; - - ret = _set_memory_wt(addr, numpages); - if (ret) - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - - return ret; -} -EXPORT_SYMBOL_GPL(set_memory_wt); - int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ @@ -1937,29 +1884,11 @@ int set_memory_wb(unsigned long addr, int numpages) if (ret) return ret; - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return 0; } EXPORT_SYMBOL(set_memory_wb); -int set_memory_array_wb(unsigned long *addr, int numpages) -{ - int i; - int ret; - - /* WB cache mode is hard wired to all cache attribute bits being 0 */ - ret = change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK), 1); - if (ret) - return ret; - - for (i = 0; i < numpages; i++) - free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); - - return 0; -} -EXPORT_SYMBOL(set_memory_array_wb); - int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) @@ -1967,7 +1896,6 @@ int set_memory_x(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } -EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { @@ -1976,7 +1904,6 @@ int set_memory_nx(unsigned long addr, int numpages) return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } -EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned long addr, int numpages) { @@ -2098,7 +2025,7 @@ static int _set_pages_array(struct page **pages, int numpages, continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; - if 
(reserve_memtype(start, end, new_type, NULL)) + if (memtype_reserve(start, end, new_type, NULL)) goto err_out; } @@ -2124,7 +2051,7 @@ err_out: continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; - free_memtype(start, end); + memtype_free(start, end); } return -EINVAL; } @@ -2173,29 +2100,13 @@ int set_pages_array_wb(struct page **pages, int numpages) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; - free_memtype(start, end); + memtype_free(start, end); } return 0; } EXPORT_SYMBOL(set_pages_array_wb); -int set_pages_x(struct page *page, int numpages) -{ - unsigned long addr = (unsigned long)page_address(page); - - return set_memory_x(addr, numpages); -} -EXPORT_SYMBOL(set_pages_x); - -int set_pages_nx(struct page *page, int numpages) -{ - unsigned long addr = (unsigned long)page_address(page); - - return set_memory_nx(addr, numpages); -} -EXPORT_SYMBOL(set_pages_nx); - int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -2315,7 +2226,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(0), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), .flags = 0, }; @@ -2324,12 +2235,6 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(__supported_pte_mask & _PAGE_NX)) goto out; - if (!(page_flags & _PAGE_NX)) - cpa.mask_clr = __pgprot(_PAGE_NX); - - if (!(page_flags & _PAGE_RW)) - cpa.mask_clr = __pgprot(_PAGE_RW); - if (!(page_flags & _PAGE_ENC)) cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); @@ -2381,5 +2286,5 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, * be exposed to the rest of the kernel. Include these directly here. */ #ifdef CONFIG_CPA_DEBUG -#include "pageattr-test.c" +#include "cpa-test.c" #endif diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c deleted file mode 100644 index fa16036fa592..000000000000 --- a/arch/x86/mm/pat_rbtree.c +++ /dev/null @@ -1,281 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Handle caching attributes in page tables (PAT) - * - * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * Suresh B Siddha <suresh.b.siddha@intel.com> - * - * Interval tree (augmented rbtree) used to store the PAT memory type - * reservations. - */ - -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/rbtree_augmented.h> -#include <linux/sched.h> -#include <linux/gfp.h> - -#include <asm/pgtable.h> -#include <asm/pat.h> - -#include "pat_internal.h" - -/* - * The memtype tree keeps track of memory type for specific - * physical memory areas. Without proper tracking, conflicting memory - * types in different mappings can cause CPU cache corruption. - * - * The tree is an interval tree (augmented rbtree) with tree ordered - * on starting address. Tree can contain multiple entries for - * different regions which overlap. All the aliases have the same - * cache attributes of course. - * - * memtype_lock protects the rbtree. 
- */ - -static struct rb_root memtype_rbroot = RB_ROOT; - -static int is_node_overlap(struct memtype *node, u64 start, u64 end) -{ - if (node->start >= end || node->end <= start) - return 0; - - return 1; -} - -static u64 get_subtree_max_end(struct rb_node *node) -{ - u64 ret = 0; - if (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - ret = data->subtree_max_end; - } - return ret; -} - -static u64 compute_subtree_max_end(struct memtype *data) -{ - u64 max_end = data->end, child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_right); - if (child_max_end > max_end) - max_end = child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_left); - if (child_max_end > max_end) - max_end = child_max_end; - - return max_end; -} - -RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, - u64, subtree_max_end, compute_subtree_max_end) - -/* Find the first (lowest start addr) overlapping range from rb tree */ -static struct memtype *memtype_rb_lowest_match(struct rb_root *root, - u64 start, u64 end) -{ - struct rb_node *node = root->rb_node; - struct memtype *last_lower = NULL; - - while (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - - if (get_subtree_max_end(node->rb_left) > start) { - /* Lowest overlap if any must be on left side */ - node = node->rb_left; - } else if (is_node_overlap(data, start, end)) { - last_lower = data; - break; - } else if (start >= data->start) { - /* Lowest overlap if any must be on right side */ - node = node->rb_right; - } else { - break; - } - } - return last_lower; /* Returns NULL if there is no overlap */ -} - -enum { - MEMTYPE_EXACT_MATCH = 0, - MEMTYPE_END_MATCH = 1 -}; - -static struct memtype *memtype_rb_match(struct rb_root *root, - u64 start, u64 end, int match_type) -{ - struct memtype *match; - - match = memtype_rb_lowest_match(root, start, end); - while (match != NULL && match->start < end) { - struct rb_node *node; - - if ((match_type == MEMTYPE_EXACT_MATCH) && - (match->start == start) && (match->end == end)) - return match; - - if ((match_type == MEMTYPE_END_MATCH) && - (match->start < start) && (match->end == end)) - return match; - - node = rb_next(&match->rb); - if (node) - match = rb_entry(node, struct memtype, rb); - else - match = NULL; - } - - return NULL; /* Returns NULL if there is no match */ -} - -static int memtype_rb_check_conflict(struct rb_root *root, - u64 start, u64 end, - enum page_cache_mode reqtype, - enum page_cache_mode *newtype) -{ - struct rb_node *node; - struct memtype *match; - enum page_cache_mode found_type = reqtype; - - match = memtype_rb_lowest_match(&memtype_rbroot, start, end); - if (match == NULL) - goto success; - - if (match->type != found_type && newtype == NULL) - goto failure; - - dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); - found_type = match->type; - - node = rb_next(&match->rb); - while (node) { - match = rb_entry(node, struct memtype, rb); - - if (match->start >= end) /* Checked all possible matches */ - goto success; - - if (is_node_overlap(match, start, end) && - match->type != found_type) { - goto failure; - } - - node = rb_next(&match->rb); - } -success: - if (newtype) - *newtype = found_type; - - return 0; - -failure: - pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, start, end, - cattr_name(found_type), cattr_name(match->type)); - return -EBUSY; -} - -static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) -{ - struct rb_node **node 
= &(root->rb_node); - struct rb_node *parent = NULL; - - while (*node) { - struct memtype *data = rb_entry(*node, struct memtype, rb); - - parent = *node; - if (data->subtree_max_end < newdata->end) - data->subtree_max_end = newdata->end; - if (newdata->start <= data->start) - node = &((*node)->rb_left); - else if (newdata->start > data->start) - node = &((*node)->rb_right); - } - - newdata->subtree_max_end = newdata->end; - rb_link_node(&newdata->rb, parent, node); - rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); -} - -int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *ret_type) -{ - int err = 0; - - err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, - new->type, ret_type); - - if (!err) { - if (ret_type) - new->type = *ret_type; - - new->subtree_max_end = new->end; - memtype_rb_insert(&memtype_rbroot, new); - } - return err; -} - -struct memtype *rbt_memtype_erase(u64 start, u64 end) -{ - struct memtype *data; - - /* - * Since the memtype_rbroot tree allows overlapping ranges, - * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free - * a whole node for the munmap case. If no such entry is found, - * it then checks with END_MATCH, i.e. shrink the size of a node - * from the end for the mremap case. - */ - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_EXACT_MATCH); - if (!data) { - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_END_MATCH); - if (!data) - return ERR_PTR(-EINVAL); - } - - if (data->start == start) { - /* munmap: erase this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - } else { - /* mremap: update the end value of this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - data->end = start; - data->subtree_max_end = data->end; - memtype_rb_insert(&memtype_rbroot, data); - return NULL; - } - - return data; -} - -struct memtype *rbt_memtype_lookup(u64 addr) -{ - return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); -} - -#if defined(CONFIG_DEBUG_FS) -int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) -{ - struct rb_node *node; - int i = 1; - - node = rb_first(&memtype_rbroot); - while (node && pos != i) { - node = rb_next(node); - i++; - } - - if (node) { /* pos == i */ - struct memtype *this = rb_entry(node, struct memtype, rb); - *out = *this; - return 0; - } else { - return 1; - } -} -#endif diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 44816ff6411f..7bd2c3a52297 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -45,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } @@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use @@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd) } #else -void __init pgd_cache_init(void) -{ -} - static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, @@ -647,8 +643,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, - pgprot_t flags) +void native_set_fixmap(unsigned /* enum 
fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9bb7f0ab9fe6..0e6700eaa4f9 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -18,6 +18,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/io.h> +#include <linux/vmalloc.h> unsigned int __VMALLOC_RESERVE = 128 << 20; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index bdc98150d4db..fc3f3d3e2ef2 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <asm/page.h> +#include <linux/vmalloc.h> #include "physaddr.h" diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index b196524759ec..44a9f068eee0 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -330,13 +330,15 @@ pti_clone_pgtable(unsigned long start, unsigned long end, pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - addr += PUD_SIZE; + WARN_ON_ONCE(addr & ~PUD_MASK); + addr = round_up(addr + 1, PUD_SIZE); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr += PMD_SIZE; + WARN_ON_ONCE(addr & ~PMD_MASK); + addr = round_up(addr + 1, PMD_SIZE); continue; } @@ -572,7 +574,7 @@ static void pti_clone_kernel_text(void) */ unsigned long start = PFN_ALIGN(_text); unsigned long end_clone = (unsigned long)__end_rodata_aligned; - unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table); + unsigned long end_global = PFN_ALIGN((unsigned long)_etext); if (!pti_kernel_image_global_ok()) return; @@ -666,6 +668,8 @@ void __init pti_init(void) */ void pti_finalize(void) { + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; /* * We need to clone everything (again) that maps parts of the * kernel image. diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 0881e1ff1e58..bda73cb7a044 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/io.h> #include <linux/mmiotrace.h> +#include <linux/security.h> static unsigned long mmio_address; module_param_hw(mmio_address, ulong, iomem, 0); @@ -78,7 +79,7 @@ static void do_read_far_test(void __iomem *p) static void do_test(unsigned long size) { - void __iomem *p = ioremap_nocache(mmio_address, size); + void __iomem *p = ioremap(mmio_address, size); if (!p) { pr_err("could not ioremap, aborting.\n"); return; @@ -103,7 +104,7 @@ static void do_test_bulk_ioremapping(void) int i; for (i = 0; i < 10; ++i) { - p = ioremap_nocache(mmio_address, PAGE_SIZE); + p = ioremap(mmio_address, PAGE_SIZE); if (p) iounmap(p); } @@ -115,6 +116,10 @@ static void do_test_bulk_ioremapping(void) static int __init init(void) { unsigned long size = (read_far) ? 
(8 << 20) : (16 << 10); + int ret = security_locked_down(LOCKDOWN_MMIOTRACE); + + if (ret) + return ret; if (mmio_address == 0) { pr_err("you have to use the module argument mmio_address.\n"); @@ -122,9 +127,9 @@ static int __init init(void) return -ENXIO; } - pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " - "and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warn("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); do_test_bulk_ioremapping(); pr_info("All done.\n"); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4de9704c4aaf..66f96f21a7b6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -440,7 +440,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { - load_mm_cr4(next); + load_mm_cr4_irqsoff(next); switch_ldt(real_prev, next); } } @@ -708,7 +708,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); else on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, - (void *)info, 1, GFP_ATOMIC, cpumask); + (void *)info, 1, cpumask); } /* |
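( Editor's illustrative sketch, not part of the patch: the renamed PAT interfaces above are typically paired as reserve, then sync of the kernel identity mapping, then free on error, as the set_memory_*() and reserve_pfn_range() hunks show. The function below and its arguments are hypothetical, and the page attribute update that real callers perform in between is omitted. )

/*
 * Sketch only: claim a physical range as WC in the memtype tree and keep
 * the kernel identity mapping in sync, undoing the reservation on failure.
 */
static int example_reserve_wc(unsigned long vaddr, int numpages)
{
        u64 start = __pa(vaddr);
        u64 end = start + numpages * PAGE_SIZE;
        int ret;

        /* Record the range as WC in the memtype interval tree: */
        ret = memtype_reserve(start, end, _PAGE_CACHE_MODE_WC, NULL);
        if (ret)
                return ret;

        /* Sync the identity mapping; on failure drop the reservation: */
        if (memtype_kernel_map_sync(start, end - start, _PAGE_CACHE_MODE_WC) < 0) {
                memtype_free(start, end);
                return -EINVAL;
        }

        return 0;
}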