summaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile15
-rw-r--r--arch/x86/mm/amdtopology.c1
-rw-r--r--arch/x86/mm/cpu_entry_area.c21
-rw-r--r--arch/x86/mm/debug_pagetables.c32
-rw-r--r--arch/x86/mm/dump_pagetables.c196
-rw-r--r--arch/x86/mm/extable.c114
-rw-r--r--arch/x86/mm/fault.c571
-rw-r--r--arch/x86/mm/highmem_32.c4
-rw-r--r--arch/x86/mm/hugetlbpage.c4
-rw-r--r--arch/x86/mm/ident_map.c5
-rw-r--r--arch/x86/mm/init.c102
-rw-r--r--arch/x86/mm/init_32.c39
-rw-r--r--arch/x86/mm/init_64.c102
-rw-r--r--arch/x86/mm/iomap_32.c6
-rw-r--r--arch/x86/mm/ioremap.c29
-rw-r--r--arch/x86/mm/kasan_init_64.c43
-rw-r--r--arch/x86/mm/kaslr.c30
-rw-r--r--arch/x86/mm/kmmio.c25
-rw-r--r--arch/x86/mm/mem_encrypt.c692
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c568
-rw-r--r--arch/x86/mm/mmap.c41
-rw-r--r--arch/x86/mm/mpx.c30
-rw-r--r--arch/x86/mm/numa.c25
-rw-r--r--arch/x86/mm/numa_32.c12
-rw-r--r--arch/x86/mm/numa_64.c2
-rw-r--r--arch/x86/mm/numa_emulation.c116
-rw-r--r--arch/x86/mm/pageattr-test.c2
-rw-r--r--arch/x86/mm/pageattr.c810
-rw-r--r--arch/x86/mm/pat.c18
-rw-r--r--arch/x86/mm/pgtable.c222
-rw-r--r--arch/x86/mm/physaddr.c2
-rw-r--r--arch/x86/mm/pkeys.c21
-rw-r--r--arch/x86/mm/pti.c377
-rw-r--r--arch/x86/mm/tlb.c220
34 files changed, 2830 insertions, 1667 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 27e9e90a8d35..4b101dd6e52f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: GPL-2.0
-# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
-KCOV_INSTRUMENT_tlb.o := n
-KCOV_INSTRUMENT_mem_encrypt.o := n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_identity.o := n
-KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_identity.o := n
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
@@ -16,6 +19,7 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_mem_encrypt_identity.o := $(nostackp)
CFLAGS_fault.o := -I$(src)/../include/asm/trace
@@ -47,4 +51,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 048c761d97b0..058b2f36b3a6 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,7 +12,6 @@
#include <linux/string.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 476d810639a8..12d7e7fb4efd 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -2,6 +2,8 @@
#include <linux/spinlock.h>
#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+#include <linux/kcore.h>
#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
@@ -27,8 +29,20 @@ EXPORT_SYMBOL(get_cpu_entry_area);
void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
unsigned long va = (unsigned long) cea_vaddr;
+ pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags);
- set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+ /*
+ * The cpu_entry_area is shared between the user and kernel
+ * page tables. All of its ptes can safely be global.
+ * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for
+ * non-present PTEs, so be careful not to set it in that
+ * case to avoid confusion.
+ */
+ if (boot_cpu_has(X86_FEATURE_PGE) &&
+ (pgprot_val(flags) & _PAGE_PRESENT))
+ pte = pte_set_flags(pte, _PAGE_GLOBAL);
+
+ set_pte_vaddr(va, pte);
}
static void __init
@@ -68,8 +82,6 @@ static void percpu_setup_debug_store(int cpu)
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
- extern char _entry_trampoline[];
-
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
@@ -131,9 +143,6 @@ static void __init setup_cpu_entry_area(int cpu)
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
-
- cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
percpu_setup_debug_store(cpu);
}
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 421f2664ffa0..225fe2f0bfec 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -1,4 +1,5 @@
#include <linux/debugfs.h>
+#include <linux/efi.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <asm/pgtable.h>
@@ -72,6 +73,30 @@ static const struct file_operations ptdump_curusr_fops = {
};
#endif
+#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
+static struct dentry *pe_efi;
+
+static int ptdump_show_efi(struct seq_file *m, void *v)
+{
+ if (efi_mm.pgd)
+ ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false);
+ return 0;
+}
+
+static int ptdump_open_efi(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_efi, NULL);
+}
+
+static const struct file_operations ptdump_efi_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_efi,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
@@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void)
if (!pe_curusr)
goto err;
#endif
+
+#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
+ pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops);
+ if (!pe_efi)
+ goto err;
+#endif
+
return 0;
err:
debugfs_remove_recursive(dir);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2a4849e92831..fc37bbd23eb8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -18,7 +18,10 @@
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
+#include <linux/highmem.h>
+#include <linux/pci.h>
+#include <asm/e820/types.h>
#include <asm/pgtable.h>
/*
@@ -29,6 +32,7 @@
struct pg_state {
int level;
pgprot_t current_prot;
+ pgprotval_t effective_prot;
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
@@ -85,11 +89,15 @@ static struct addr_marker address_markers[] = {
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
#ifdef CONFIG_KASAN
- [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
- [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
+ /*
+ * These fields get initialized with the (dynamic)
+ * KASAN_SHADOW_{START,END} values in pt_dump_init().
+ */
+ [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
- [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
+ [LDT_NR] = { 0UL, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
@@ -105,6 +113,8 @@ static struct addr_marker address_markers[] = {
[END_OF_SPACE_NR] = { -1, NULL }
};
+#define INIT_PGD ((pgd_t *) &init_top_pgt)
+
#else /* CONFIG_X86_64 */
enum address_markers_idx {
@@ -115,6 +125,9 @@ enum address_markers_idx {
#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ LDT_NR,
+#endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
END_OF_SPACE_NR,
@@ -128,11 +141,16 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_HIGHMEM
[PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { 0UL, "LDT remap" },
+#endif
[CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
[FIXADDR_START_NR] = { 0UL, "Fixmap area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
+#define INIT_PGD (swapper_pg_dir)
+
#endif /* !CONFIG_X86_64 */
/* Multipliers for offsets within the PTEs */
@@ -225,15 +243,38 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift;
}
+static void note_wx(struct pg_state *st)
+{
+ unsigned long npages;
+
+ npages = (st->current_address - st->start_address) / PAGE_SIZE;
+
+#ifdef CONFIG_PCI_BIOS
+ /*
+ * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
+ * Inform about it, but avoid the warning.
+ */
+ if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
+ st->current_address <= PAGE_OFFSET + BIOS_END) {
+ pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
+ return;
+ }
+#endif
+ /* Account the WX pages */
+ st->wx_pages += npages;
+ WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+}
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
* print what we collected so far.
*/
static void note_page(struct seq_file *m, struct pg_state *st,
- pgprot_t new_prot, int level)
+ pgprot_t new_prot, pgprotval_t new_eff, int level)
{
- pgprotval_t prot, cur;
+ pgprotval_t prot, cur, eff;
static const char units[] = "BKMGTPE";
/*
@@ -243,30 +284,25 @@ static void note_page(struct seq_file *m, struct pg_state *st,
*/
prot = pgprot_val(new_prot);
cur = pgprot_val(st->current_prot);
+ eff = st->effective_prot;
if (!st->level) {
/* First entry */
st->current_prot = new_prot;
+ st->effective_prot = new_eff;
st->level = level;
st->marker = address_markers;
st->lines = 0;
pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
st->marker->name);
- } else if (prot != cur || level != st->level ||
+ } else if (prot != cur || new_eff != eff || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
- pgprotval_t pr = pgprot_val(st->current_prot);
-
- if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
- WARN_ONCE(1,
- "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
- (void *)st->start_address,
- (void *)st->start_address);
- st->wx_pages += (st->current_address -
- st->start_address) / PAGE_SIZE;
- }
+
+ if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
+ note_wx(st);
/*
* Now print the actual finished series
@@ -313,22 +349,31 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->start_address = st->current_address;
st->current_prot = new_prot;
+ st->effective_prot = new_eff;
st->level = level;
}
}
-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
+static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
+{
+ return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
+ ((prot1 | prot2) & _PAGE_NX);
+}
+
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+ pgprotval_t eff_in, unsigned long P)
{
int i;
- pte_t *start;
- pgprotval_t prot;
+ pte_t *pte;
+ pgprotval_t prot, eff;
- start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
- prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
- note_page(m, st, __pgprot(prot), 5);
- start++;
+ pte = pte_offset_map(&addr, st->current_address);
+ prot = pte_flags(*pte);
+ eff = effective_prot(eff_in, prot);
+ note_page(m, st, __pgprot(prot), eff, 5);
+ pte_unmap(pte);
}
}
#ifdef CONFIG_KASAN
@@ -344,12 +389,10 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
void *pt)
{
if (__pa(pt) == __pa(kasan_zero_pmd) ||
-#ifdef CONFIG_X86_5LEVEL
- __pa(pt) == __pa(kasan_zero_p4d) ||
-#endif
+ (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
__pa(pt) == __pa(kasan_zero_pud)) {
pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
- note_page(m, st, __pgprot(prot), 5);
+ note_page(m, st, __pgprot(prot), 0, 5);
return true;
}
return false;
@@ -364,42 +407,45 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
#if PTRS_PER_PMD > 1
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+ pgprotval_t eff_in, unsigned long P)
{
int i;
pmd_t *start, *pmd_start;
- pgprotval_t prot;
+ pgprotval_t prot, eff;
pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
+ prot = pmd_flags(*start);
+ eff = effective_prot(eff_in, prot);
if (pmd_large(*start) || !pmd_present(*start)) {
- prot = pmd_flags(*start);
- note_page(m, st, __pgprot(prot), 4);
+ note_page(m, st, __pgprot(prot), eff, 4);
} else if (!kasan_page_table(m, st, pmd_start)) {
- walk_pte_level(m, st, *start,
+ walk_pte_level(m, st, *start, eff,
P + i * PMD_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 4);
+ note_page(m, st, __pgprot(0), 0, 4);
start++;
}
}
#else
-#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a) pmd_none(__pmd(pud_val(a)))
#endif
#if PTRS_PER_PUD > 1
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
+ pgprotval_t eff_in, unsigned long P)
{
int i;
pud_t *start, *pud_start;
- pgprotval_t prot;
+ pgprotval_t prot, eff;
pud_t *prev_pud = NULL;
pud_start = start = (pud_t *)p4d_page_vaddr(addr);
@@ -407,15 +453,16 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!pud_none(*start)) {
+ prot = pud_flags(*start);
+ eff = effective_prot(eff_in, prot);
if (pud_large(*start) || !pud_present(*start)) {
- prot = pud_flags(*start);
- note_page(m, st, __pgprot(prot), 3);
+ note_page(m, st, __pgprot(prot), eff, 3);
} else if (!kasan_page_table(m, st, pud_start)) {
- walk_pmd_level(m, st, *start,
+ walk_pmd_level(m, st, *start, eff,
P + i * PUD_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 3);
+ note_page(m, st, __pgprot(0), 0, 3);
prev_pud = start;
start++;
@@ -423,43 +470,43 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
}
#else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a) pud_none(__pud(p4d_val(a)))
#endif
-#if PTRS_PER_P4D > 1
-
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+ pgprotval_t eff_in, unsigned long P)
{
int i;
p4d_t *start, *p4d_start;
- pgprotval_t prot;
+ pgprotval_t prot, eff;
+
+ if (PTRS_PER_P4D == 1)
+ return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);
p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_P4D; i++) {
st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
if (!p4d_none(*start)) {
+ prot = p4d_flags(*start);
+ eff = effective_prot(eff_in, prot);
if (p4d_large(*start) || !p4d_present(*start)) {
- prot = p4d_flags(*start);
- note_page(m, st, __pgprot(prot), 2);
+ note_page(m, st, __pgprot(prot), eff, 2);
} else if (!kasan_page_table(m, st, p4d_start)) {
- walk_pud_level(m, st, *start,
+ walk_pud_level(m, st, *start, eff,
P + i * P4D_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 2);
+ note_page(m, st, __pgprot(0), 0, 2);
start++;
}
}
-#else
-#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
-#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
-#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
-#endif
+#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
static inline bool is_hypervisor_range(int idx)
{
@@ -478,12 +525,8 @@ static inline bool is_hypervisor_range(int idx)
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx, bool dmesg)
{
-#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_top_pgt;
-#else
- pgd_t *start = swapper_pg_dir;
-#endif
- pgprotval_t prot;
+ pgd_t *start = INIT_PGD;
+ pgprotval_t prot, eff;
int i;
struct pg_state st = {};
@@ -499,15 +542,20 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
if (!pgd_none(*start) && !is_hypervisor_range(i)) {
+ prot = pgd_flags(*start);
+#ifdef CONFIG_X86_PAE
+ eff = _PAGE_USER | _PAGE_RW;
+#else
+ eff = prot;
+#endif
if (pgd_large(*start) || !pgd_present(*start)) {
- prot = pgd_flags(*start);
- note_page(m, &st, __pgprot(prot), 1);
+ note_page(m, &st, __pgprot(prot), eff, 1);
} else {
- walk_p4d_level(m, &st, *start,
+ walk_p4d_level(m, &st, *start, eff,
i * PGD_LEVEL_MULT);
}
} else
- note_page(m, &st, __pgprot(0), 1);
+ note_page(m, &st, __pgprot(0), 0, 1);
cond_resched();
start++;
@@ -515,7 +563,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
/* Flush out the last page */
st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
- note_page(m, &st, __pgprot(0), 0);
+ note_page(m, &st, __pgprot(0), 0, 0);
if (!checkwx)
return;
if (st.wx_pages)
@@ -540,12 +588,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
-static void ptdump_walk_user_pgd_level_checkwx(void)
+void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pgd_t *pgd = (pgd_t *) &init_top_pgt;
+ pgd_t *pgd = INIT_PGD;
- if (!static_cpu_has(X86_FEATURE_PTI))
+ if (!(__supported_pte_mask & _PAGE_NX) ||
+ !static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
@@ -557,7 +606,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void)
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
- ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
@@ -570,6 +618,13 @@ static int __init pt_dump_init(void)
address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+#endif
+#ifdef CONFIG_KASAN
+ address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+ address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
#endif
#ifdef CONFIG_X86_32
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
@@ -579,6 +634,9 @@ static int __init pt_dump_init(void)
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
+# ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+# endif
#endif
return 0;
}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 45f5d6cf65ae..6521134057e8 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -8,7 +8,8 @@
#include <asm/kdebug.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
- struct pt_regs *, int);
+ struct pt_regs *, int, unsigned long,
+ unsigned long);
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
@@ -22,7 +23,9 @@ ex_fixup_handler(const struct exception_table_entry *x)
}
__visible bool ex_handler_default(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
return true;
@@ -30,7 +33,9 @@ __visible bool ex_handler_default(const struct exception_table_entry *fixup,
EXPORT_SYMBOL(ex_handler_default);
__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
@@ -43,7 +48,9 @@ EXPORT_SYMBOL_GPL(ex_handler_fault);
* result of a refcount inc/dec/add/sub.
*/
__visible bool ex_handler_refcount(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
/* First unconditionally saturate the refcount. */
*(int *)regs->cx = INT_MIN / 2;
@@ -96,7 +103,9 @@ EXPORT_SYMBOL(ex_handler_refcount);
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
@@ -108,9 +117,79 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+/* Helper to check whether a uaccess fault indicates a kernel bug. */
+static bool bogus_uaccess(struct pt_regs *regs, int trapnr,
+ unsigned long fault_addr)
+{
+ /* This is the normal case: #PF with a fault address in userspace. */
+ if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX)
+ return false;
+
+ /*
+ * This code can be reached for machine checks, but only if the #MC
+ * handler has already decided that it looks like a candidate for fixup.
+ * This e.g. happens when attempting to access userspace memory which
+ * the CPU can't access because of uncorrectable bad memory.
+ */
+ if (trapnr == X86_TRAP_MC)
+ return false;
+
+ /*
+ * There are two remaining exception types we might encounter here:
+ * - #PF for faulting accesses to kernel addresses
+ * - #GP for faulting accesses to noncanonical addresses
+ * Complain about anything else.
+ */
+ if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) {
+ WARN(1, "unexpected trap %d in uaccess\n", trapnr);
+ return false;
+ }
+
+ /*
+ * This is a faulting memory access in kernel space, on a kernel
+ * address, in a usercopy function. This can e.g. be caused by improper
+ * use of helpers like __put_user and by improper attempts to access
+ * userspace addresses in KERNEL_DS regions.
+ * The one (semi-)legitimate exception are probe_kernel_{read,write}(),
+ * which can be invoked from places like kgdb, /dev/mem (for reading)
+ * and privileged BPF code (for reading).
+ * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag
+ * to tell us that faulting on kernel addresses, and even noncanonical
+ * addresses, in a userspace accessor does not necessarily imply a
+ * kernel bug, root might just be doing weird stuff.
+ */
+ if (current->kernel_uaccess_faults_ok)
+ return false;
+
+ /* This is bad. Refuse the fixup so that we go into die(). */
+ if (trapnr == X86_TRAP_PF) {
+ pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n",
+ fault_addr);
+ } else {
+ pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n");
+ }
+ return true;
+}
+
+__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
+{
+ if (bogus_uaccess(regs, trapnr, fault_addr))
+ return false;
+ regs->ip = ex_fixup_addr(fixup);
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_uaccess);
+
__visible bool ex_handler_ext(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
+ if (bogus_uaccess(regs, trapnr, fault_addr))
+ return false;
/* Special hack for uaccess_err */
current->thread.uaccess_err = 1;
regs->ip = ex_fixup_addr(fixup);
@@ -119,7 +198,9 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup,
EXPORT_SYMBOL(ex_handler_ext);
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
@@ -134,7 +215,9 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, (unsigned int)regs->dx,
@@ -148,12 +231,14 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
asm volatile ("mov %0, %%fs" : : "rm" (0));
- return ex_handler_default(fixup, regs, trapnr);
+ return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
}
EXPORT_SYMBOL(ex_handler_clear_fs);
@@ -170,7 +255,8 @@ __visible bool ex_has_fault_handler(unsigned long ip)
return handler == ex_handler_fault;
}
-int fixup_exception(struct pt_regs *regs, int trapnr)
+int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
+ unsigned long fault_addr)
{
const struct exception_table_entry *e;
ex_handler_t handler;
@@ -194,7 +280,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr)
return 0;
handler = ex_fixup_handler(e);
- return handler(e, regs, trapnr);
+ return handler(e, regs, trapnr, error_code, fault_addr);
}
extern unsigned int early_recursion_flag;
@@ -230,9 +316,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
* result in a hard-to-debug panic.
*
* Keep in mind that not all vectors actually get here. Early
- * fage faults, for example, are special.
+ * page faults, for example, are special.
*/
- if (fixup_exception(regs, trapnr))
+ if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
return;
if (fixup_bug(regs, trapnr))
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 25a30b5d6582..71d4b9d4d43f 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,7 +8,7 @@
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/extable.h> /* search_exception_tables */
-#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/memblock.h> /* max_low_pfn */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
@@ -16,6 +16,8 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -24,6 +26,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
+#include <asm/efi.h> /* efi_recover_from_page_fault()*/
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -43,17 +46,19 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
- int ret = 0;
-
- /* kprobe_running() needs smp_processor_id() */
- if (kprobes_built_in() && !user_mode(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 14))
- ret = 1;
- preempt_enable();
- }
-
- return ret;
+ if (!kprobes_built_in())
+ return 0;
+ if (user_mode(regs))
+ return 0;
+ /*
+ * To be potentially processing a kprobe fault and to be allowed to call
+ * kprobe_running(), we have to be non-preemptible.
+ */
+ if (preemptible())
+ return 0;
+ if (!kprobe_running())
+ return 0;
+ return kprobe_fault_handler(regs, X86_TRAP_PF);
}
/*
@@ -152,78 +157,6 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
-/*
- * A protection key fault means that the PKRU value did not allow
- * access to some PTE. Userspace can figure out what PKRU was
- * from the XSAVE state, and this function fills out a field in
- * siginfo so userspace can discover which protection key was set
- * on the PTE.
- *
- * If we get here, we know that the hardware signaled a X86_PF_PK
- * fault and that there was a VMA once we got in the fault
- * handler. It does *not* guarantee that the VMA we find here
- * was the one that we faulted on.
- *
- * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
- * 2. T1 : set PKRU to deny access to pkey=4, touches page
- * 3. T1 : faults...
- * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
- * 5. T1 : enters fault handler, takes mmap_sem, etc...
- * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
- * faulted on a pte with its pkey=4.
- */
-static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
- u32 *pkey)
-{
- /* This is effectively an #ifdef */
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return;
-
- /* Fault not from Protection Keys: nothing to do */
- if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
- return;
- /*
- * force_sig_info_fault() is called from a number of
- * contexts, some of which have a VMA and some of which
- * do not. The X86_PF_PK handing happens after we have a
- * valid VMA, so we should never reach this without a
- * valid VMA.
- */
- if (!pkey) {
- WARN_ONCE(1, "PKU fault with no VMA passed in");
- info->si_pkey = 0;
- return;
- }
- /*
- * si_pkey should be thought of as a strong hint, but not
- * absolutely guranteed to be 100% accurate because of
- * the race explained above.
- */
- info->si_pkey = *pkey;
-}
-
-static void
-force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk, u32 *pkey, int fault)
-{
- unsigned lsb = 0;
- siginfo_t info;
-
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
- if (fault & VM_FAULT_HWPOISON_LARGE)
- lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
- if (fault & VM_FAULT_HWPOISON)
- lsb = PAGE_SHIFT;
- info.si_addr_lsb = lsb;
-
- fill_sig_info_pkey(si_signo, si_code, &info, pkey);
-
- force_sig_info(si_signo, &info, tsk);
-}
-
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
@@ -316,8 +249,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
- WARN_ON_ONCE(in_nmi());
-
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
@@ -417,11 +348,11 @@ void vmalloc_sync_all(void)
*/
static noinline int vmalloc_fault(unsigned long address)
{
- pgd_t *pgd, *pgd_ref;
- p4d_t *p4d, *p4d_ref;
- pud_t *pud, *pud_ref;
- pmd_t *pmd, *pmd_ref;
- pte_t *pte, *pte_ref;
+ pgd_t *pgd, *pgd_k;
+ p4d_t *p4d, *p4d_k;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
/* Make sure we are in vmalloc area: */
if (!(address >= VMALLOC_START && address < VMALLOC_END))
@@ -435,73 +366,51 @@ static noinline int vmalloc_fault(unsigned long address)
* case just flush:
*/
pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
- pgd_ref = pgd_offset_k(address);
- if (pgd_none(*pgd_ref))
+ pgd_k = pgd_offset_k(address);
+ if (pgd_none(*pgd_k))
return -1;
- if (CONFIG_PGTABLE_LEVELS > 4) {
+ if (pgtable_l5_enabled()) {
if (pgd_none(*pgd)) {
- set_pgd(pgd, *pgd_ref);
+ set_pgd(pgd, *pgd_k);
arch_flush_lazy_mmu_mode();
} else {
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
}
}
/* With 4-level paging, copying happens on the p4d level. */
p4d = p4d_offset(pgd, address);
- p4d_ref = p4d_offset(pgd_ref, address);
- if (p4d_none(*p4d_ref))
+ p4d_k = p4d_offset(pgd_k, address);
+ if (p4d_none(*p4d_k))
return -1;
- if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
- set_p4d(p4d, *p4d_ref);
+ if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
+ set_p4d(p4d, *p4d_k);
arch_flush_lazy_mmu_mode();
} else {
- BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
+ BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
}
- /*
- * Below here mismatches are bugs because these lower tables
- * are shared:
- */
BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
pud = pud_offset(p4d, address);
- pud_ref = pud_offset(p4d_ref, address);
- if (pud_none(*pud_ref))
+ if (pud_none(*pud))
return -1;
- if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
- BUG();
-
if (pud_large(*pud))
return 0;
pmd = pmd_offset(pud, address);
- pmd_ref = pmd_offset(pud_ref, address);
- if (pmd_none(*pmd_ref))
+ if (pmd_none(*pmd))
return -1;
- if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
- BUG();
-
if (pmd_large(*pmd))
return 0;
- pte_ref = pte_offset_kernel(pmd_ref, address);
- if (!pte_present(*pte_ref))
- return -1;
-
pte = pte_offset_kernel(pmd, address);
-
- /*
- * Don't use pte_page here, because the mappings can point
- * outside mem_map, and the NUMA hash lookup cannot handle
- * that:
- */
- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
- BUG();
+ if (!pte_present(*pte))
+ return -1;
return 0;
}
@@ -662,11 +571,6 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
return 0;
}
-static const char nx_warning[] = KERN_CRIT
-"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
-static const char smep_warning[] = KERN_CRIT
-"unable to execute userspace code (SMEP?) (uid: %d)\n";
-
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
@@ -685,21 +589,18 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
pte = lookup_address_in_pgd(pgd, address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
- printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+ pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
+ from_kuid(&init_user_ns, current_uid()));
if (pte && pte_present(*pte) && pte_exec(*pte) &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
- printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
+ pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
+ from_kuid(&init_user_ns, current_uid()));
}
- printk(KERN_ALERT "BUG: unable to handle kernel ");
- if (address < PAGE_SIZE)
- printk(KERN_CONT "NULL pointer dereference");
- else
- printk(KERN_CONT "paging request");
-
- printk(KERN_CONT " at %px\n", (void *) address);
- printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
+ pr_alert("BUG: unable to handle kernel %s at %px\n",
+ address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
+ (void *)address);
dump_pagetable(address);
}
@@ -739,7 +640,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
int sig;
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF)) {
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
* Any interrupt that takes a fault gets the fixup. This makes
* the below recursive fault logic only apply to a faults from
@@ -760,8 +661,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
- force_sig_info_fault(signal, si_code, address,
- tsk, NULL, 0);
+ force_sig_fault(signal, si_code, (void __user *)address,
+ tsk);
}
/*
@@ -819,6 +720,13 @@ no_context(struct pt_regs *regs, unsigned long error_code,
return;
/*
+ * Buggy firmware could access regions which might page fault, try to
+ * recover from such faults.
+ */
+ if (IS_ENABLED(CONFIG_EFI))
+ efi_recover_from_page_fault(address);
+
+ /*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice:
*/
@@ -851,6 +759,8 @@ static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct task_struct *tsk)
{
+ const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+
if (!unhandled_signal(tsk, SIGSEGV))
return;
@@ -858,18 +768,28 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
return;
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address,
+ loglvl, tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
print_vma_addr(KERN_CONT " in ", regs->ip);
printk(KERN_CONT "\n");
+
+ show_opcodes(regs, loglvl);
+}
+
+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+ return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, int si_code)
+ unsigned long address, u32 pkey, int si_code)
{
struct task_struct *tsk = current;
@@ -890,18 +810,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
-#ifdef CONFIG_X86_64
- /*
- * Instruction fetch faults in the vsyscall page might need
- * emulation.
- */
- if (unlikely((error_code & X86_PF_INSTR) &&
- ((address & ~0xfff) == VSYSCALL_ADDR))) {
- if (emulate_vsyscall(regs, address))
- return;
- }
-#endif
-
/*
* To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses
@@ -917,7 +825,10 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
+ if (si_code == SEGV_PKUERR)
+ force_sig_pkuerr((void __user *)address, pkey);
+
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);
return;
}
@@ -930,35 +841,29 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey)
+ unsigned long address)
{
- __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
+ __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma, int si_code)
+ unsigned long address, u32 pkey, int si_code)
{
struct mm_struct *mm = current->mm;
- u32 pkey;
-
- if (vma)
- pkey = vma_pkey(vma);
-
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
up_read(&mm->mmap_sem);
- __bad_area_nosemaphore(regs, error_code, address,
- (vma) ? &pkey : NULL, si_code);
+ __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+ __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
}
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
@@ -987,18 +892,40 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
* But, doing it this way allows compiler optimizations
* if pkeys are compiled out.
*/
- if (bad_area_access_from_pkeys(error_code, vma))
- __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
- else
- __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
+ if (bad_area_access_from_pkeys(error_code, vma)) {
+ /*
+ * A protection key fault means that the PKRU value did not allow
+ * access to some PTE. Userspace can figure out what PKRU was
+ * from the XSAVE state. This function captures the pkey from
+ * the vma and passes it to userspace so userspace can discover
+ * which protection key was set on the PTE.
+ *
+ * If we get here, we know that the hardware signaled a X86_PF_PK
+ * fault and that there was a VMA once we got in the fault
+ * handler. It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+ u32 pkey = vma_pkey(vma);
+
+ __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ } else {
+ __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+ }
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- u32 *pkey, unsigned int fault)
+ unsigned int fault)
{
struct task_struct *tsk = current;
- int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & X86_PF_USER)) {
@@ -1016,18 +943,25 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
- printk(KERN_ERR
+ unsigned lsb = 0;
+
+ pr_err(
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address);
- code = BUS_MCEERR_AR;
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk);
+ return;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, unsigned int fault)
+ unsigned long address, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -1051,27 +985,21 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, pkey, fault);
+ do_sigbus(regs, error_code, address, fault);
else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address, pkey);
+ bad_area_nosemaphore(regs, error_code, address);
else
BUG();
}
}
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
- /*
- * Note: We do not do lazy flushing on protection key
- * changes, so no spurious fault will ever set X86_PF_PK.
- */
- if ((error_code & X86_PF_PK))
- return 1;
return 1;
}
@@ -1098,7 +1026,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation).
*/
static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1129,27 +1057,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0;
if (p4d_large(*p4d))
- return spurious_fault_check(error_code, (pte_t *) p4d);
+ return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
- return spurious_fault_check(error_code, (pte_t *) pud);
+ return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
- return spurious_fault_check(error_code, (pte_t *) pmd);
+ return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
- ret = spurious_fault_check(error_code, pte);
+ ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
@@ -1157,12 +1085,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
- ret = spurious_fault_check(error_code, (pte_t *) pmd);
+ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
@@ -1209,6 +1137,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address)
{
+ /*
+ * On 64-bit systems, the vsyscall page is at an address above
+ * TASK_SIZE_MAX, but is not considered part of the kernel
+ * address space.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+ return false;
+
return address >= TASK_SIZE_MAX;
}
@@ -1230,31 +1166,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
}
/*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Called for all faults where 'address' is part of the kernel address
+ * space. Might get called for faults that originate from *code* that
+ * ran in userspace or the kernel.
*/
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+static void
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ unsigned long address)
{
- struct vm_area_struct *vma;
- struct task_struct *tsk;
- struct mm_struct *mm;
- int fault, major = 0;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- u32 pkey;
-
- tsk = current;
- mm = tsk->mm;
-
- prefetchw(&mm->mmap_sem);
-
- if (unlikely(kmmio_fault(regs, address)))
- return;
+ /*
+ * Protection keys exceptions only happen on user pages. We
+ * have no user pages in the kernel portion of the address
+ * space, so do not expect them here.
+ */
+ WARN_ON_ONCE(hw_error_code & X86_PF_PK);
/*
- * We fault-in kernel-space virtual memory on-demand. The
+ * We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
@@ -1262,41 +1190,73 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* only copy the information from the master page table,
* nothing more.
*
- * This verifies that the fault happens in kernel space
- * (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 9) == 0.
+ * Before doing this on-demand faulting, ensure that the
+ * fault is not any of the following:
+ * 1. A fault on a PTE with a reserved bit set.
+ * 2. A fault caused by a user-mode access. (Do not demand-
+ * fault kernel memory due to user-mode accesses).
+ * 3. A fault caused by a page-level protection violation.
+ * (A demand fault would be on a non-present page which
+ * would have X86_PF_PROT==0).
*/
- if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
- if (vmalloc_fault(address) >= 0)
- return;
- }
-
- /* Can handle a stale RO->RW TLB: */
- if (spurious_fault(error_code, address))
+ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
return;
+ }
- /* kprobes don't want to hook the spurious faults: */
- if (kprobes_fault(regs))
- return;
- /*
- * Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock:
- */
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ /* Was the fault spurious, caused by lazy TLB invalidation? */
+ if (spurious_kernel_fault(hw_error_code, address))
+ return;
+ /* kprobes don't want to hook the spurious faults: */
+ if (kprobes_fault(regs))
return;
- }
+
+ /*
+ * Note, despite being a "bad area", there are quite a few
+ * acceptable reasons to get here, such as erratum fixups
+ * and handling kernel code that can fault, like get_user().
+ *
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock:
+ */
+ bad_area_nosemaphore(regs, hw_error_code, address);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+ unsigned long hw_error_code,
+ unsigned long address)
+{
+ unsigned long sw_error_code;
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ vm_fault_t fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ tsk = current;
+ mm = tsk->mm;
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & X86_PF_RSVD))
- pgtable_bad(regs, error_code, address);
+ /*
+ * Reserved bits are never expected to be set on
+ * entries in the user portion of the page tables.
+ */
+ if (unlikely(hw_error_code & X86_PF_RSVD))
+ pgtable_bad(regs, hw_error_code, address);
- if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ /*
+ * Check for invalid kernel (supervisor) access to user
+ * pages in the user address space.
+ */
+ if (unlikely(smap_violation(hw_error_code, regs))) {
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
@@ -1305,11 +1265,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
/*
+ * hw_error_code is literally the "page fault error code" passed to
+ * the kernel directly from the hardware. But, we will shortly be
+ * modifying it in software, so give it a new name.
+ */
+ sw_error_code = hw_error_code;
+
+ /*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
@@ -1318,7 +1285,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
- error_code |= X86_PF_USER;
+ /*
+ * Up to this point, X86_PF_USER set in hw_error_code
+ * indicated a user-mode access. But, after this,
+ * X86_PF_USER in sw_error_code will indicate either
+ * that, *or* an implicit kernel(supervisor)-mode access
+ * which originated from user mode.
+ */
+ if (!(hw_error_code & X86_PF_USER)) {
+ /*
+ * The CPU was in user mode, but the CPU says
+ * the fault was not a user-mode access.
+ * Must be an implicit kernel-mode access,
+ * which we do not expect to happen in the
+ * user address space.
+ */
+ pr_warn_once("kernel-mode error from user-mode: %lx\n",
+ hw_error_code);
+
+ sw_error_code |= X86_PF_USER;
+ }
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1327,31 +1313,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & X86_PF_WRITE)
+ if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & X86_PF_INSTR)
+ if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
+#ifdef CONFIG_X86_64
/*
- * When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in
- * the kernel and should generate an OOPS. Unfortunately, in the
- * case of an erroneous fault occurring in a code path which already
- * holds mmap_sem we will deadlock attempting to validate the fault
- * against the address space. Luckily the kernel only validly
- * references user space from well defined areas of code, which are
- * listed in the exceptions table.
+ * Instruction fetch faults in the vsyscall page might need
+ * emulation. The vsyscall page is at a high address
+ * (>PAGE_OFFSET), but is considered to be part of the user
+ * address space.
*
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a
- * deadlock. Attempt to lock the address space, if we cannot we then
- * validate the source. If this is invalid we can skip the address
- * space check, thus avoiding the deadlock:
+ * The vsyscall page does not have a "real" VMA, so do this
+ * emulation before we go searching for VMAs.
+ */
+ if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+ if (emulate_vsyscall(regs, address))
+ return;
+ }
+#endif
+
+ /*
+ * Kernel-mode access to the user address space should only occur
+ * on well-defined single instructions listed in the exception
+ * tables. But, an erroneous kernel fault occurring outside one of
+ * those areas which also holds mmap_sem might deadlock attempting
+ * to validate the fault against the address space.
+ *
+ * Only do the expensive exception table search when we might be at
+ * risk of a deadlock. This happens if we
+ * 1. Failed to acquire mmap_sem, and
+ * 2. The access did not originate in userspace. Note: either the
+ * hardware or earlier page fault code may set X86_PF_USER
+ * in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if (!(error_code & X86_PF_USER) &&
+ if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address, NULL);
+ /*
+ * Fault from code in kernel from
+ * which we do not expect faults.
+ */
+ bad_area_nosemaphore(regs, sw_error_code, address);
return;
}
retry:
@@ -1367,16 +1371,16 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
- if (error_code & X86_PF_USER) {
+ if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
@@ -1384,12 +1388,12 @@ retry:
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, error_code, address);
+ bad_area(regs, sw_error_code, address);
return;
}
@@ -1398,8 +1402,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address, vma);
+ if (unlikely(access_error(sw_error_code, vma))) {
+ bad_area_access_error(regs, sw_error_code, address, vma);
return;
}
@@ -1415,10 +1419,7 @@ good_area:
* (potentially after handling any pending signal during the return to
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
- * Thus we have to be careful about not touching vma after handling the
- * fault, so we read the pkey beforehand.
*/
- pkey = vma_pkey(vma);
fault = handle_mm_fault(vma, address, flags);
major |= fault & VM_FAULT_MAJOR;
@@ -1441,13 +1442,13 @@ good_area:
return;
/* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, &pkey, fault);
+ mm_fault_error(regs, sw_error_code, address, fault);
return;
}
@@ -1465,6 +1466,28 @@ good_area:
check_v8086_mode(regs, address, tsk);
}
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ unsigned long address)
+{
+ prefetchw(&current->mm->mmap_sem);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
+
+ /* Was the fault on kernel-controlled part of the address space? */
+ if (unlikely(fault_in_kernel_space(address)))
+ do_kern_addr_fault(regs, hw_error_code, address);
+ else
+ do_user_addr_fault(regs, hw_error_code, address);
+}
NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 6d18b70ed5a9..0d4bdcb84da5 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,7 +1,7 @@
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/swap.h> /* for totalram_pages */
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
void *kmap(struct page *page)
{
@@ -111,7 +111,7 @@ void __init set_highmem_pages_init(void)
/*
* Explicitly reset zone->managed_pages because set_highmem_pages_init()
- * is invoked before free_all_bootmem()
+ * is invoked before memblock_free_all()
*/
reset_all_zones_managed_pages();
for_each_zone(zone) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 00b296617ca4..92e4c4b85bba 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -92,7 +92,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
* If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
* in the full address space.
*/
- info.high_limit = in_compat_syscall() ?
+ info.high_limit = in_32bit_syscall() ?
task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
@@ -116,7 +116,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
* If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
* in the full address space.
*/
- if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ab33a32df2a8..fe7a12599d8e 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -98,6 +98,9 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
if (!info->kernpg_flag)
info->kernpg_flag = _KERNPG_TABLE;
+ /* Filter out unsupported __PAGE_KERNEL_* bits: */
+ info->kernpg_flag &= __default_kernel_pte_mask;
+
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
p4d_t *p4d;
@@ -120,7 +123,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (pgtable_l5_enabled()) {
set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 82f5252c723a..ef99f3892e1f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,7 +3,8 @@
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -97,15 +98,22 @@ __ref void *alloc_low_pages(unsigned int num)
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
- unsigned long ret;
- if (min_pfn_mapped >= max_pfn_mapped)
- panic("alloc_low_pages: ran out of memory");
- ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ unsigned long ret = 0;
+
+ if (min_pfn_mapped < max_pfn_mapped) {
+ ret = memblock_find_in_range(
+ min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
+ }
+ if (ret)
+ memblock_reserve(ret, PAGE_SIZE * num);
+ else if (can_use_brk_pgt)
+ ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
+
if (!ret)
panic("alloc_low_pages: can not alloc memory");
- memblock_reserve(ret, PAGE_SIZE * num);
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
@@ -161,12 +169,6 @@ struct map_range {
static int page_size_mask;
-static void enable_global_pages(void)
-{
- if (!static_cpu_has(X86_FEATURE_PTI))
- __supported_pte_mask |= _PAGE_GLOBAL;
-}
-
static void __init probe_page_size_mask(void)
{
/*
@@ -187,9 +189,15 @@ static void __init probe_page_size_mask(void)
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
- enable_global_pages();
+ __supported_pte_mask |= _PAGE_GLOBAL;
}
+ /* By the default is everything supported: */
+ __default_kernel_pte_mask = __supported_pte_mask;
+ /* Except when with PTI where the kernel is mostly non-Global: */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ __default_kernel_pte_mask &= ~_PAGE_GLOBAL;
+
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
printk(KERN_INFO "Using GB pages for direct mapping\n");
@@ -706,7 +714,9 @@ void __init init_mem_mapping(void)
*/
int devmem_is_allowed(unsigned long pagenr)
{
- if (page_is_ram(pagenr)) {
+ if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
+ != REGION_DISJOINT) {
/*
* For disallowed memory regions in the low 1MB range,
* request that the page be shown as all zeros.
@@ -771,13 +781,48 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
}
}
+/*
+ * begin/end can be in the direct map or the "high kernel mapping"
+ * used for the kernel image only. free_init_pages() will do the
+ * right thing for either kind of address.
+ */
+void free_kernel_image_pages(void *begin, void *end)
+{
+ unsigned long begin_ul = (unsigned long)begin;
+ unsigned long end_ul = (unsigned long)end;
+ unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
+
+
+ free_init_pages("unused kernel image", begin_ul, end_ul);
+
+ /*
+ * PTI maps some of the kernel into userspace. For performance,
+ * this includes some kernel areas that do not contain secrets.
+ * Those areas might be adjacent to the parts of the kernel image
+ * being freed, which may contain secrets. Remove the "high kernel
+ * image mapping" for these freed areas, ensuring they are not even
+ * potentially vulnerable to Meltdown regardless of the specific
+ * optimizations PTI is currently using.
+ *
+ * The "noalias" prevents unmapping the direct map alias which is
+ * needed to access the freed pages.
+ *
+ * This is only valid for 64bit kernels. 32bit has only one mapping
+ * which can't be treated in this way for obvious reasons.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
+ set_memory_np_noalias(begin_ul, len_pages);
+}
+
+void __weak mem_encrypt_free_decrypted_mem(void) { }
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
- free_init_pages("unused kernel",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
+ mem_encrypt_free_decrypted_mem();
+
+ free_kernel_image_pages(&__init_begin, &__init_end);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -878,3 +923,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
__pte2cachemode_tbl[entry] = cache;
}
+
+#ifdef CONFIG_SWAP
+unsigned long max_swapfile_size(void)
+{
+ unsigned long pages;
+
+ pages = generic_max_swapfile_size();
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+ unsigned long long l1tf_limit = l1tf_pfn_limit();
+ /*
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+#endif
+ pages = min_t(unsigned long long, l1tf_limit, pages);
+ }
+ return pages;
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 396e1f0151ac..49ecf5ecf6d3 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -23,7 +23,6 @@
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/memory_hotplug.h>
@@ -558,8 +557,14 @@ static void __init pagetable_init(void)
permanent_kmaps_init(pgd_base);
}
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+#define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL)
+/* Bits supported by the hardware: */
+pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK;
+/* Bits allowed in normal kernel mappings: */
+pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
+/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
+EXPORT_SYMBOL(__default_kernel_pte_mask);
/* user-defined highmem size */
static unsigned int highmem_pages = -1;
@@ -686,7 +691,7 @@ void __init initmem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
@@ -765,7 +770,7 @@ void __init mem_init(void)
#endif
/*
* With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
- * be done before free_all_bootmem(). Memblock use free low memory for
+ * be done before memblock_free_all(). Memblock use free low memory for
* temporary data (see find_range_array()) and for this purpose can use
* pages that was already passed to the buddy allocator, hence marked as
* not accessible in the page tables when compiled with
@@ -775,9 +780,10 @@ void __init mem_init(void)
set_highmem_pages_init();
/* this will put all low memory onto the freelists */
- free_all_bootmem();
+ memblock_free_all();
after_bootmem = 1;
+ x86_init.hyper.init_after_bootmem();
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
@@ -916,34 +922,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
+ unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
- start, start+size);
- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
-
- printk(KERN_INFO "Testing CPA: write protecting again\n");
- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
-
- start += size;
- size = (unsigned long)__end_rodata - start;
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
- size >> 10);
-
-#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+ pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Testing CPA: write protecting again\n");
+ pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index af11a2890235..5fab264948c2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -20,7 +20,6 @@
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
@@ -65,8 +64,13 @@
* around without checking the pgd every time.
*/
+/* Bits supported by the hardware: */
pteval_t __supported_pte_mask __read_mostly = ~0;
+/* Bits allowed in normal kernel mappings: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
+/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
+EXPORT_SYMBOL(__default_kernel_pte_mask);
int force_personality32;
@@ -88,12 +92,7 @@ static int __init nonx32_setup(char *str)
}
__setup("noexec32=", nonx32_setup);
-/*
- * When memory was added make sure all the processes MM have
- * suitable PGD entries in the local PGD level page.
- */
-#ifdef CONFIG_X86_5LEVEL
-void sync_global_pgds(unsigned long start, unsigned long end)
+static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -129,8 +128,8 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
-#else
-void sync_global_pgds(unsigned long start, unsigned long end)
+
+static void sync_global_pgds_l4(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -143,7 +142,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
* With folded p4d, pgd_none() is always false, we need to
* handle synchonization on p4d level.
*/
- BUILD_BUG_ON(pgd_none(*pgd_ref));
+ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
if (p4d_none(*p4d_ref))
@@ -173,7 +172,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
-#endif
+
+/*
+ * When memory was added make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ if (pgtable_l5_enabled())
+ sync_global_pgds_l5(start, end);
+ else
+ sync_global_pgds_l4(start, end);
+}
/*
* NOTE: This function is marked __ref because it calls __init function
@@ -186,7 +196,7 @@ static __ref void *spp_getpage(void)
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
- ptr = alloc_bootmem_pages(PAGE_SIZE);
+ ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
panic("set_pte_phys: cannot allocate page data %s\n",
@@ -632,7 +642,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -712,7 +722,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled())
pgd_populate(&init_mm, pgd, p4d);
else
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -731,7 +741,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
#endif
@@ -1089,7 +1099,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
* 5-level case we should free them. This code will have to change
* to adapt for boot-time switching between 4 and 5 level page tables.
*/
- if (CONFIG_PGTABLE_LEVELS == 5)
+ if (pgtable_l5_enabled())
free_pud_table(pud_base, p4d);
}
@@ -1177,13 +1187,14 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
/* this will put all memory onto the freelists */
- free_all_bootmem();
+ memblock_free_all();
after_bootmem = 1;
+ x86_init.hyper.init_after_bootmem();
/*
* Must be done after boot memory is put on freelist, because here we
* might set fields in deferred struct pages that have not yet been
- * initialized, and free_all_bootmem() initializes all the reserved
+ * initialized, and memblock_free_all() initializes all the reserved
* deferred pages for us.
*/
register_page_bootmem_info();
@@ -1271,12 +1282,8 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
- free_init_pages("unused kernel",
- (unsigned long) __va(__pa_symbol(text_end)),
- (unsigned long) __va(__pa_symbol(rodata_start)));
- free_init_pages("unused kernel",
- (unsigned long) __va(__pa_symbol(rodata_end)),
- (unsigned long) __va(__pa_symbol(_sdata)));
+ free_kernel_image_pages((void *)text_end, (void *)rodata_start);
+ free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
debug_checkwx();
}
@@ -1322,14 +1329,51 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(*pte));
}
-static unsigned long probe_memory_block_size(void)
+/*
+ * Block size is the minimum amount of memory which can be hotplugged or
+ * hotremoved. It must be power of two and must be equal or larger than
+ * MIN_MEMORY_BLOCK_SIZE.
+ */
+#define MAX_BLOCK_SIZE (2UL << 30)
+
+/* Amount of ram needed to start using large blocks */
+#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)
+
+/* Adjustable memory block size */
+static unsigned long set_memory_block_size;
+int __init set_memory_block_size_order(unsigned int order)
{
- unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
+ unsigned long size = 1UL << order;
- /* if system is UV or has 64GB of RAM or more, use large blocks */
- if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
- bz = 2UL << 30; /* 2GB */
+ if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE)
+ return -EINVAL;
+ set_memory_block_size = size;
+ return 0;
+}
+
+static unsigned long probe_memory_block_size(void)
+{
+ unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
+ unsigned long bz;
+
+ /* If memory block size has been set, then use it */
+ bz = set_memory_block_size;
+ if (bz)
+ goto done;
+
+ /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
+ if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
+ bz = MIN_MEMORY_BLOCK_SIZE;
+ goto done;
+ }
+
+ /* Find the largest allowed block size that aligns to memory end */
+ for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
+ if (IS_ALIGNED(boot_mem_end, bz))
+ break;
+ }
+done:
pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
return bz;
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index ada98b39b8ad..b3294d36769d 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -44,6 +44,9 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
return ret;
*prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm));
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(*prot) &= __default_kernel_pte_mask;
+
return 0;
}
EXPORT_SYMBOL_GPL(iomap_create_wc);
@@ -88,6 +91,9 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
prot = __pgprot(__PAGE_KERNEL |
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(prot) &= __default_kernel_pte_mask;
+
return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e2db83bebc3b..5378d10f1d31 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -6,7 +6,7 @@
* (C) Copyright 1995 1996 Linus Torvalds
*/
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
- unsigned long size, enum page_cache_mode pcm, void *caller)
+ unsigned long size, enum page_cache_mode pcm,
+ void *caller, bool encrypted)
{
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping.
*/
prot = PAGE_KERNEL_IO;
- if (sev_active() && mem_flags.desc_other)
+ if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_nocache);
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL_GPL(ioremap_uc);
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wc);
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wt);
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+ __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_cache);
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
- __builtin_return_address(0));
+ __builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);
@@ -816,6 +824,9 @@ void __init __early_set_fixmap(enum fixed_addresses idx,
}
pte = early_ioremap_pte(addr);
+ /* Sanitize 'prot' against any unsupported bits: */
+ pgprot_val(flags) &= __default_kernel_pte_mask;
+
if (pgprot_val(flags))
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index af6f2f9c6a26..04a9cf6b034f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,10 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
-#include <linux/bootmem.h>
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/memblock.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
-#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -19,16 +22,16 @@
extern struct range pfn_mapped[E820_MAX_ENTRIES];
-static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
static __init void *early_alloc(size_t size, int nid, bool panic)
{
if (panic)
- return memblock_virt_alloc_try_nid(size, size,
- __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ return memblock_alloc_try_nid(size, size,
+ __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
else
- return memblock_virt_alloc_try_nid_nopanic(size, size,
- __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ return memblock_alloc_try_nid_nopanic(size, size,
+ __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
}
static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
@@ -176,10 +179,10 @@ static void __init clear_pgds(unsigned long start,
* With folded p4d, pgd_clear() is nop, use p4d_clear()
* instead.
*/
- if (CONFIG_PGTABLE_LEVELS < 5)
- p4d_clear(p4d_offset(pgd, start));
- else
+ if (pgtable_l5_enabled())
pgd_clear(pgd);
+ else
+ p4d_clear(p4d_offset(pgd, start));
}
pgd = pgd_offset_k(start);
@@ -191,7 +194,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
unsigned long p4d;
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!pgtable_l5_enabled())
return (p4d_t *)pgd;
p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -263,6 +266,12 @@ void __init kasan_early_init(void)
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
+ /* Mask out unsupported __PAGE_KERNEL bits: */
+ pte_val &= __default_kernel_pte_mask;
+ pmd_val &= __default_kernel_pte_mask;
+ pud_val &= __default_kernel_pte_mask;
+ p4d_val &= __default_kernel_pte_mask;
+
for (i = 0; i < PTRS_PER_PTE; i++)
kasan_zero_pte[i] = __pte(pte_val);
@@ -272,7 +281,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
+ for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -303,7 +312,7 @@ void __init kasan_init(void)
* bunch of things like kernel code, modules, EFI mapping, etc.
* We need to take extra steps to not overwrite them.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (pgtable_l5_enabled()) {
void *ptr;
ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
@@ -365,7 +374,13 @@ void __init kasan_init(void)
*/
memset(kasan_zero_page, 0, PAGE_SIZE);
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
+ pte_t pte;
+ pgprot_t prot;
+
+ prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC);
+ pgprot_val(prot) &= __default_kernel_pte_mask;
+
+ pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot));
set_pte(&kasan_zero_pte[i], pte);
}
/* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aedebd2ebf1e..3f452ffed7e9 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/memblock.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
@@ -34,23 +35,12 @@
#define TB_SHIFT 40
/*
- * Virtual address start and end range for randomization.
- *
* The end address could depend on more configuration options to make the
* highest amount of space for randomization available, but that's too hard
* to keep straight and caused issues already.
*/
-static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
-/* Default values */
-unsigned long page_offset_base = __PAGE_OFFSET_BASE;
-EXPORT_SYMBOL(page_offset_base);
-unsigned long vmalloc_base = __VMALLOC_BASE;
-EXPORT_SYMBOL(vmalloc_base);
-unsigned long vmemmap_base = __VMEMMAP_BASE;
-EXPORT_SYMBOL(vmemmap_base);
-
/*
* Memory regions randomized by KASLR (except modules that use a separate logic
* earlier during boot). The list is ordered based on virtual addresses. This
@@ -60,8 +50,8 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
- { &vmalloc_base, VMALLOC_SIZE_TB },
+ { &page_offset_base, 0 },
+ { &vmalloc_base, 0 },
{ &vmemmap_base, 1 },
};
@@ -84,11 +74,14 @@ static inline bool kaslr_memory_enabled(void)
void __init kernel_randomize_memory(void)
{
size_t i;
- unsigned long vaddr = vaddr_start;
+ unsigned long vaddr_start, vaddr;
unsigned long rand, memory_tb;
struct rnd_state rand_state;
unsigned long remain_entropy;
+ vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
+ vaddr = vaddr_start;
+
/*
* These BUILD_BUG_ON checks ensure the memory layout is consistent
* with the vaddr_start/vaddr_end variables. These checks are very
@@ -101,6 +94,9 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;
+ kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
+ kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
+
/*
* Update Physical memory mapping to available and
* add padding if needed (especially for memory hotplug support).
@@ -129,7 +125,7 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled())
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -141,7 +137,7 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled())
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -217,7 +213,7 @@ void __meminit init_trampoline(void)
return;
}
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled())
init_trampoline_p4d();
else
init_trampoline_pud();
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 7c8686709636..79eb55ce69a9 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
+ pmd_t new_pmd;
pmdval_t v = pmd_val(*pmd);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pmd(pmd, __pmd(v));
+ *old = v;
+ new_pmd = pmd_mknotpresent(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ set_pmd(pmd, new_pmd);
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pte_atomic(pte, __pte(v));
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, 0, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_atomic(pte, __pte(*old));
+ }
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 1a53071e2e17..006f373f54ab 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -25,17 +25,12 @@
#include <asm/bootparam.h>
#include <asm/set_memory.h>
#include <asm/cacheflush.h>
-#include <asm/sections.h>
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/cmdline.h>
#include "mm_internal.h"
-static char sme_cmdline_arg[] __initdata = "mem_encrypt";
-static char sme_cmdline_on[] __initdata = "on";
-static char sme_cmdline_off[] __initdata = "off";
-
/*
* Since SME related variables are set early in the boot process they must
* reside in the .data section so as not to be zeroed out when the .bss
@@ -46,7 +41,7 @@ EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
-static bool sev_enabled __section(.data);
+bool sev_enabled __section(.data);
/* Buffer used for early in-place encryption by BSP, no locking needed */
static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
@@ -200,67 +195,6 @@ void __init sme_early_init(void)
swiotlb_force = SWIOTLB_FORCE;
}
-static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, unsigned long attrs)
-{
- unsigned long dma_mask;
- unsigned int order;
- struct page *page;
- void *vaddr = NULL;
-
- dma_mask = dma_alloc_coherent_mask(dev, gfp);
- order = get_order(size);
-
- /*
- * Memory will be memset to zero after marking decrypted, so don't
- * bother clearing it before.
- */
- gfp &= ~__GFP_ZERO;
-
- page = alloc_pages_node(dev_to_node(dev), gfp, order);
- if (page) {
- dma_addr_t addr;
-
- /*
- * Since we will be clearing the encryption bit, check the
- * mask with it already cleared.
- */
- addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
- if ((addr + size) > dma_mask) {
- __free_pages(page, get_order(size));
- } else {
- vaddr = page_address(page);
- *dma_handle = addr;
- }
- }
-
- if (!vaddr)
- vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
-
- if (!vaddr)
- return NULL;
-
- /* Clear the SME encryption bit for DMA use if not swiotlb area */
- if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
- set_memory_decrypted((unsigned long)vaddr, 1 << order);
- memset(vaddr, 0, PAGE_SIZE << order);
- *dma_handle = __sme_clr(*dma_handle);
- }
-
- return vaddr;
-}
-
-static void sev_free(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, unsigned long attrs)
-{
- /* Set the SME encryption bit for re-use if not swiotlb area */
- if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle)))
- set_memory_encrypted((unsigned long)vaddr,
- 1 << get_order(size));
-
- swiotlb_free_coherent(dev, size, vaddr, dma_handle);
-}
-
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
{
pgprot_t old_prot, new_prot;
@@ -413,21 +347,31 @@ bool sev_active(void)
}
EXPORT_SYMBOL(sev_active);
-static const struct dma_map_ops sev_dma_ops = {
- .alloc = sev_alloc,
- .free = sev_free,
- .map_page = swiotlb_map_page,
- .unmap_page = swiotlb_unmap_page,
- .map_sg = swiotlb_map_sg_attrs,
- .unmap_sg = swiotlb_unmap_sg_attrs,
- .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
- .sync_single_for_device = swiotlb_sync_single_for_device,
- .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = swiotlb_sync_sg_for_device,
- .mapping_error = swiotlb_dma_mapping_error,
-};
-
/* Architecture __weak replacement functions */
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (mem_encrypt_active()) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
+
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
@@ -437,12 +381,11 @@ void __init mem_encrypt_init(void)
swiotlb_update_mem_attributes();
/*
- * With SEV, DMA operations cannot use encryption. New DMA ops
- * are required in order to mark the DMA areas as decrypted or
- * to use bounce buffers.
+ * With SEV, DMA operations cannot use encryption, we need to use
+ * SWIOTLB to bounce buffer DMA operation.
*/
if (sev_active())
- dma_ops = &sev_dma_ops;
+ dma_ops = &swiotlb_dma_ops;
/*
* With SEV, we need to unroll the rep string I/O instructions.
@@ -455,582 +398,3 @@ void __init mem_encrypt_init(void)
: "Secure Memory Encryption (SME)");
}
-void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
-{
- WARN(PAGE_ALIGN(size) != size,
- "size is not page-aligned (%#lx)\n", size);
-
- /* Make the SWIOTLB buffer area decrypted */
- set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
-}
-
-struct sme_populate_pgd_data {
- void *pgtable_area;
- pgd_t *pgd;
-
- pmdval_t pmd_flags;
- pteval_t pte_flags;
- unsigned long paddr;
-
- unsigned long vaddr;
- unsigned long vaddr_end;
-};
-
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
-{
- unsigned long pgd_start, pgd_end, pgd_size;
- pgd_t *pgd_p;
-
- pgd_start = ppd->vaddr & PGDIR_MASK;
- pgd_end = ppd->vaddr_end & PGDIR_MASK;
-
- pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
-
- pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
-
- memset(pgd_p, 0, pgd_size);
-}
-
-#define PGD_FLAGS _KERNPG_TABLE_NOENC
-#define P4D_FLAGS _KERNPG_TABLE_NOENC
-#define PUD_FLAGS _KERNPG_TABLE_NOENC
-#define PMD_FLAGS _KERNPG_TABLE_NOENC
-
-#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
-
-#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
- (_PAGE_PAT | _PAGE_PWT))
-
-#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
-
-#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
-
-#define PTE_FLAGS_DEC PTE_FLAGS
-#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
- (_PAGE_PAT | _PAGE_PWT))
-
-#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
-
-static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
-{
- pgd_t *pgd_p;
- p4d_t *p4d_p;
- pud_t *pud_p;
- pmd_t *pmd_p;
-
- pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
- if (native_pgd_val(*pgd_p)) {
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
- p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- else
- pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- } else {
- pgd_t pgd;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p = ppd->pgtable_area;
- memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
- ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
-
- pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
- } else {
- pud_p = ppd->pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
- }
- native_set_pgd(pgd_p, pgd);
- }
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p += p4d_index(ppd->vaddr);
- if (native_p4d_val(*p4d_p)) {
- pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
- } else {
- p4d_t p4d;
-
- pud_p = ppd->pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
- native_set_p4d(p4d_p, p4d);
- }
- }
-
- pud_p += pud_index(ppd->vaddr);
- if (native_pud_val(*pud_p)) {
- if (native_pud_val(*pud_p) & _PAGE_PSE)
- return NULL;
-
- pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
- } else {
- pud_t pud;
-
- pmd_p = ppd->pgtable_area;
- memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
- ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
-
- pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
- native_set_pud(pud_p, pud);
- }
-
- return pmd_p;
-}
-
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
-{
- pmd_t *pmd_p;
-
- pmd_p = sme_prepare_pgd(ppd);
- if (!pmd_p)
- return;
-
- pmd_p += pmd_index(ppd->vaddr);
- if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
- native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
-}
-
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
-{
- pmd_t *pmd_p;
- pte_t *pte_p;
-
- pmd_p = sme_prepare_pgd(ppd);
- if (!pmd_p)
- return;
-
- pmd_p += pmd_index(ppd->vaddr);
- if (native_pmd_val(*pmd_p)) {
- if (native_pmd_val(*pmd_p) & _PAGE_PSE)
- return;
-
- pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
- } else {
- pmd_t pmd;
-
- pte_p = ppd->pgtable_area;
- memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
- ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;
-
- pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
- native_set_pmd(pmd_p, pmd);
- }
-
- pte_p += pte_index(ppd->vaddr);
- if (!native_pte_val(*pte_p))
- native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
-}
-
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
-{
- while (ppd->vaddr < ppd->vaddr_end) {
- sme_populate_pgd_large(ppd);
-
- ppd->vaddr += PMD_PAGE_SIZE;
- ppd->paddr += PMD_PAGE_SIZE;
- }
-}
-
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
-{
- while (ppd->vaddr < ppd->vaddr_end) {
- sme_populate_pgd(ppd);
-
- ppd->vaddr += PAGE_SIZE;
- ppd->paddr += PAGE_SIZE;
- }
-}
-
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
- pmdval_t pmd_flags, pteval_t pte_flags)
-{
- unsigned long vaddr_end;
-
- ppd->pmd_flags = pmd_flags;
- ppd->pte_flags = pte_flags;
-
- /* Save original end value since we modify the struct value */
- vaddr_end = ppd->vaddr_end;
-
- /* If start is not 2MB aligned, create PTE entries */
- ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
- __sme_map_range_pte(ppd);
-
- /* Create PMD entries */
- ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
- __sme_map_range_pmd(ppd);
-
- /* If end is not 2MB aligned, create PTE entries */
- ppd->vaddr_end = vaddr_end;
- __sme_map_range_pte(ppd);
-}
-
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
-{
- __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
-}
-
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
-{
- __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
-}
-
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
-{
- __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
-}
-
-static unsigned long __init sme_pgtable_calc(unsigned long len)
-{
- unsigned long p4d_size, pud_size, pmd_size, pte_size;
- unsigned long total;
-
- /*
- * Perform a relatively simplistic calculation of the pagetable
- * entries that are needed. Those mappings will be covered mostly
- * by 2MB PMD entries so we can conservatively calculate the required
- * number of P4D, PUD and PMD structures needed to perform the
- * mappings. For mappings that are not 2MB aligned, PTE mappings
- * would be needed for the start and end portion of the address range
- * that fall outside of the 2MB alignment. This results in, at most,
- * two extra pages to hold PTE entries for each range that is mapped.
- * Incrementing the count for each covers the case where the addresses
- * cross entries.
- */
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
- pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;
-
- total = p4d_size + pud_size + pmd_size + pte_size;
-
- /*
- * Now calculate the added pagetable structures needed to populate
- * the new pagetables.
- */
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
-
- total += p4d_size + pud_size + pmd_size;
-
- return total;
-}
-
-void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
-{
- unsigned long workarea_start, workarea_end, workarea_len;
- unsigned long execute_start, execute_end, execute_len;
- unsigned long kernel_start, kernel_end, kernel_len;
- unsigned long initrd_start, initrd_end, initrd_len;
- struct sme_populate_pgd_data ppd;
- unsigned long pgtable_area_len;
- unsigned long decrypted_base;
-
- if (!sme_active())
- return;
-
- /*
- * Prepare for encrypting the kernel and initrd by building new
- * pagetables with the necessary attributes needed to encrypt the
- * kernel in place.
- *
- * One range of virtual addresses will map the memory occupied
- * by the kernel and initrd as encrypted.
- *
- * Another range of virtual addresses will map the memory occupied
- * by the kernel and initrd as decrypted and write-protected.
- *
- * The use of write-protect attribute will prevent any of the
- * memory from being cached.
- */
-
- /* Physical addresses gives us the identity mapped virtual addresses */
- kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
- kernel_len = kernel_end - kernel_start;
-
- initrd_start = 0;
- initrd_end = 0;
- initrd_len = 0;
-#ifdef CONFIG_BLK_DEV_INITRD
- initrd_len = (unsigned long)bp->hdr.ramdisk_size |
- ((unsigned long)bp->ext_ramdisk_size << 32);
- if (initrd_len) {
- initrd_start = (unsigned long)bp->hdr.ramdisk_image |
- ((unsigned long)bp->ext_ramdisk_image << 32);
- initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
- initrd_len = initrd_end - initrd_start;
- }
-#endif
-
- /* Set the encryption workarea to be immediately after the kernel */
- workarea_start = kernel_end;
-
- /*
- * Calculate required number of workarea bytes needed:
- * executable encryption area size:
- * stack page (PAGE_SIZE)
- * encryption routine page (PAGE_SIZE)
- * intermediate copy buffer (PMD_PAGE_SIZE)
- * pagetable structures for the encryption of the kernel
- * pagetable structures for workarea (in case not currently mapped)
- */
- execute_start = workarea_start;
- execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
- execute_len = execute_end - execute_start;
-
- /*
- * One PGD for both encrypted and decrypted mappings and a set of
- * PUDs and PMDs for each of the encrypted and decrypted mappings.
- */
- pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
- pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
- if (initrd_len)
- pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
-
- /* PUDs and PMDs needed in the current pagetables for the workarea */
- pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
-
- /*
- * The total workarea includes the executable encryption area and
- * the pagetable area. The start of the workarea is already 2MB
- * aligned, align the end of the workarea on a 2MB boundary so that
- * we don't try to create/allocate PTE entries from the workarea
- * before it is mapped.
- */
- workarea_len = execute_len + pgtable_area_len;
- workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
-
- /*
- * Set the address to the start of where newly created pagetable
- * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
- * structures are created when the workarea is added to the current
- * pagetables and when the new encrypted and decrypted kernel
- * mappings are populated.
- */
- ppd.pgtable_area = (void *)execute_end;
-
- /*
- * Make sure the current pagetable structure has entries for
- * addressing the workarea.
- */
- ppd.pgd = (pgd_t *)native_read_cr3_pa();
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
-
- /* Flush the TLB - no globals so cr3 is enough */
- native_write_cr3(__native_read_cr3());
-
- /*
- * A new pagetable structure is being built to allow for the kernel
- * and initrd to be encrypted. It starts with an empty PGD that will
- * then be populated with new PUDs and PMDs as the encrypted and
- * decrypted kernel mappings are created.
- */
- ppd.pgd = ppd.pgtable_area;
- memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
- ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
-
- /*
- * A different PGD index/entry must be used to get different
- * pagetable entries for the decrypted mapping. Choose the next
- * PGD index and convert it to a virtual address to be used as
- * the base of the mapping.
- */
- decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
- if (initrd_len) {
- unsigned long check_base;
-
- check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
- decrypted_base = max(decrypted_base, check_base);
- }
- decrypted_base <<= PGDIR_SHIFT;
-
- /* Add encrypted kernel (identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start;
- ppd.vaddr_end = kernel_end;
- sme_map_range_encrypted(&ppd);
-
- /* Add decrypted, write-protected kernel (non-identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
-
- if (initrd_len) {
- /* Add encrypted initrd (identity) mappings */
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start;
- ppd.vaddr_end = initrd_end;
- sme_map_range_encrypted(&ppd);
- /*
- * Add decrypted, write-protected initrd (non-identity) mappings
- */
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
- }
-
- /* Add decrypted workarea mappings to both kernel mappings */
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
-
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_map_range_decrypted(&ppd);
-
- /* Perform the encryption */
- sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
- kernel_len, workarea_start, (unsigned long)ppd.pgd);
-
- if (initrd_len)
- sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
- initrd_len, workarea_start,
- (unsigned long)ppd.pgd);
-
- /*
- * At this point we are running encrypted. Remove the mappings for
- * the decrypted areas - all that is needed for this is to remove
- * the PGD entry/entries.
- */
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_clear_pgd(&ppd);
-
- if (initrd_len) {
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_clear_pgd(&ppd);
- }
-
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_clear_pgd(&ppd);
-
- /* Flush the TLB - no globals so cr3 is enough */
- native_write_cr3(__native_read_cr3());
-}
-
-void __init __nostackprotector sme_enable(struct boot_params *bp)
-{
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
- unsigned int eax, ebx, ecx, edx;
- unsigned long feature_mask;
- bool active_by_default;
- unsigned long me_mask;
- char buffer[16];
- u64 msr;
-
- /* Check for the SME/SEV support leaf */
- eax = 0x80000000;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- if (eax < 0x8000001f)
- return;
-
-#define AMD_SME_BIT BIT(0)
-#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
-
- /*
- * Check for the SME/SEV feature:
- * CPUID Fn8000_001F[EAX]
- * - Bit 0 - Secure Memory Encryption support
- * - Bit 1 - Secure Encrypted Virtualization support
- * CPUID Fn8000_001F[EBX]
- * - Bits 5:0 - Pagetable bit position used to indicate encryption
- */
- eax = 0x8000001f;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & feature_mask))
- return;
-
- me_mask = 1UL << (ebx & 0x3f);
-
- /* Check if memory encryption is enabled */
- if (feature_mask == AMD_SME_BIT) {
- /* For SME, check the SYSCFG MSR */
- msr = __rdmsr(MSR_K8_SYSCFG);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
- return;
- } else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
- /* SEV state cannot be controlled by a command line option */
- sme_me_mask = me_mask;
- sev_enabled = true;
- return;
- }
-
- /*
- * Fixups have not been applied to phys_base yet and we're running
- * identity mapped, so we must obtain the address to the SME command
- * line argument data using rip-relative addressing.
- */
- asm ("lea sme_cmdline_arg(%%rip), %0"
- : "=r" (cmdline_arg)
- : "p" (sme_cmdline_arg));
- asm ("lea sme_cmdline_on(%%rip), %0"
- : "=r" (cmdline_on)
- : "p" (sme_cmdline_on));
- asm ("lea sme_cmdline_off(%%rip), %0"
- : "=r" (cmdline_off)
- : "p" (sme_cmdline_off));
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
- active_by_default = true;
- else
- active_by_default = false;
-
- cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
- ((u64)bp->ext_cmd_line_ptr << 32));
-
- cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
-
- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
- sme_me_mask = me_mask;
- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
- sme_me_mask = 0;
- else
- sme_me_mask = active_by_default ? me_mask : 0;
-}
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
new file mode 100644
index 000000000000..a19ef1a416ff
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -0,0 +1,568 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/*
+ * Special hack: we have to be careful, because no indirections are
+ * allowed here, and paravirt_ops is a kind of one. As it will only run in
+ * baremetal anyway, we just keep it from happening. (This list needs to
+ * be extended when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XXL
+#undef CONFIG_PARAVIRT_SPINLOCKS
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS _KERNPG_TABLE_NOENC
+
+#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
+
+#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
+
+#define PTE_FLAGS_DEC PTE_FLAGS
+#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+
+struct sme_populate_pgd_data {
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ pmdval_t pmd_flags;
+ pteval_t pte_flags;
+ unsigned long paddr;
+
+ unsigned long vaddr;
+ unsigned long vaddr_end;
+};
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[] __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = ppd->vaddr & PGDIR_MASK;
+ pgd_end = ppd->vaddr_end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
+
+ pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
+
+ memset(pgd_p, 0, pgd_size);
+}
+
+static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = ppd->pgd + pgd_index(ppd->vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = ppd->pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
+ }
+
+ p4d = p4d_offset(pgd, ppd->vaddr);
+ if (p4d_none(*p4d)) {
+ pud = ppd->pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
+ }
+
+ pud = pud_offset(p4d, ppd->vaddr);
+ if (pud_none(*pud)) {
+ pmd = ppd->pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
+ }
+
+ if (pud_large(*pud))
+ return NULL;
+
+ return pud;
+}
+
+static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_large(*pmd))
+ return;
+
+ set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
+}
+
+static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_none(*pmd)) {
+ pte = ppd->pgtable_area;
+ memset(pte, 0, sizeof(pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE;
+ set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
+ }
+
+ if (pmd_large(*pmd))
+ return;
+
+ pte = pte_offset_map(pmd, ppd->vaddr);
+ if (pte_none(*pte))
+ set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
+}
+
+static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd_large(ppd);
+
+ ppd->vaddr += PMD_PAGE_SIZE;
+ ppd->paddr += PMD_PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd(ppd);
+
+ ppd->vaddr += PAGE_SIZE;
+ ppd->paddr += PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+ pmdval_t pmd_flags, pteval_t pte_flags)
+{
+ unsigned long vaddr_end;
+
+ ppd->pmd_flags = pmd_flags;
+ ppd->pte_flags = pte_flags;
+
+ /* Save original end value since we modify the struct value */
+ vaddr_end = ppd->vaddr_end;
+
+ /* If start is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
+ __sme_map_range_pte(ppd);
+
+ /* Create PMD entries */
+ ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
+ __sme_map_range_pmd(ppd);
+
+ /* If end is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = vaddr_end;
+ __sme_map_range_pte(ppd);
+}
+
+static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
+}
+
+static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
+}
+
+static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+ unsigned long entries = 0, tables = 0;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+ * entries that are needed. Those mappings will be covered mostly
+ * by 2MB PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. For mappings that are not 2MB aligned, PTE mappings
+ * would be needed for the start and end portion of the address range
+ * that fall outside of the 2MB alignment. This results in, at most,
+ * two extra pages to hold PTE entries for each range that is mapped.
+ * Incrementing the count for each covers the case where the addresses
+ * cross entries.
+ */
+
+ /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D;
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD;
+ entries += 2 * sizeof(pte_t) * PTRS_PER_PTE;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D;
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD;
+
+ return entries + tables;
+}
+
+void __init sme_encrypt_kernel(struct boot_params *bp)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long initrd_start, initrd_end, initrd_len;
+ struct sme_populate_pgd_data ppd;
+ unsigned long pgtable_area_len;
+ unsigned long decrypted_base;
+
+ if (!sme_active())
+ return;
+
+ /*
+ * Prepare for encrypting the kernel and initrd by building new
+ * pagetables with the necessary attributes needed to encrypt the
+ * kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as decrypted and write-protected.
+ *
+ * The use of write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+ /* Physical addresses gives us the identity mapped virtual addresses */
+ kernel_start = __pa_symbol(_text);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ initrd_start = 0;
+ initrd_end = 0;
+ initrd_len = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
+ initrd_len = (unsigned long)bp->hdr.ramdisk_size |
+ ((unsigned long)bp->ext_ramdisk_size << 32);
+ if (initrd_len) {
+ initrd_start = (unsigned long)bp->hdr.ramdisk_image |
+ ((unsigned long)bp->ext_ramdisk_image << 32);
+ initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
+ initrd_len = initrd_end - initrd_start;
+ }
+#endif
+
+ /* Set the encryption workarea to be immediately after the kernel */
+ workarea_start = kernel_end;
+
+ /*
+ * Calculate required number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_PAGE_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+ if (initrd_len)
+ pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+ * the pagetable area. The start of the workarea is already 2MB
+ * aligned, align the end of the workarea on a 2MB boundary so that
+ * we don't try to create/allocate PTE entries from the workarea
+ * before it is mapped.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ ppd.pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ ppd.pgd = (pgd_t *)native_read_cr3_pa();
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * and initrd to be encrypted. It starts with an empty PGD that will
+ * then be populated with new PUDs and PMDs as the encrypted and
+ * decrypted kernel mappings are created.
+ */
+ ppd.pgd = ppd.pgtable_area;
+ memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ if (initrd_len) {
+ unsigned long check_base;
+
+ check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base = max(decrypted_base, check_base);
+ }
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add encrypted kernel (identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start;
+ ppd.vaddr_end = kernel_end;
+ sme_map_range_encrypted(&ppd);
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+
+ if (initrd_len) {
+ /* Add encrypted initrd (identity) mappings */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start;
+ ppd.vaddr_end = initrd_end;
+ sme_map_range_encrypted(&ppd);
+ /*
+ * Add decrypted, write-protected initrd (non-identity) mappings
+ */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(&ppd);
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)ppd.pgd);
+
+ if (initrd_len)
+ sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
+ initrd_len, workarea_start,
+ (unsigned long)ppd.pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ if (initrd_len) {
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+ }
+
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
+
+void __init sme_enable(struct boot_params *bp)
+{
+ const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ bool active_by_default;
+ unsigned long me_mask;
+ char buffer[16];
+ u64 msr;
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
+ /*
+ * Set the feature mask (SME or SEV) based on whether we are
+ * running under a hypervisor.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (!(eax & feature_mask))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /* For SME, check the SYSCFG MSR */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+ } else {
+ /* For SEV, check the SEV MSR */
+ msr = __rdmsr(MSR_AMD64_SEV);
+ if (!(msr & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* SEV state cannot be controlled by a command line option */
+ sme_me_mask = me_mask;
+ sev_enabled = true;
+ physical_mask &= ~sme_me_mask;
+ return;
+ }
+
+ /*
+ * Fixups have not been applied to phys_base yet and we're running
+ * identity mapped, so we must obtain the address to the SME command
+ * line argument data using rip-relative addressing.
+ */
+ asm ("lea sme_cmdline_arg(%%rip), %0"
+ : "=r" (cmdline_arg)
+ : "p" (sme_cmdline_arg));
+ asm ("lea sme_cmdline_on(%%rip), %0"
+ : "=r" (cmdline_on)
+ : "p" (sme_cmdline_on));
+ asm ("lea sme_cmdline_off(%%rip), %0"
+ : "=r" (cmdline_off)
+ : "p" (sme_cmdline_off));
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+ active_by_default = true;
+ else
+ active_by_default = false;
+
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+
+ cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+ if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+ sme_me_mask = me_mask;
+ else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+ sme_me_mask = 0;
+ else
+ sme_me_mask = active_by_default ? me_mask : 0;
+
+ physical_mask &= ~sme_me_mask;
+}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 155ecbac9e28..db3165714521 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -90,9 +90,10 @@ unsigned long arch_mmap_rnd(void)
return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
-static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
+static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
+ struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
unsigned long gap_min, gap_max;
@@ -126,16 +127,17 @@ static unsigned long mmap_legacy_base(unsigned long rnd,
* process VM image, sets up which VM layout function to use:
*/
static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
- unsigned long random_factor, unsigned long task_size)
+ unsigned long random_factor, unsigned long task_size,
+ struct rlimit *rlim_stack)
{
*legacy_base = mmap_legacy_base(random_factor, task_size);
if (mmap_is_legacy())
*base = *legacy_base;
else
- *base = mmap_base(random_factor, task_size);
+ *base = mmap_base(random_factor, task_size, rlim_stack);
}
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
mm->get_unmapped_area = arch_get_unmapped_area;
@@ -143,7 +145,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
- arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
+ arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
+ rlim_stack);
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/*
@@ -153,7 +156,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* mmap_base, the compat syscall uses mmap_compat_base.
*/
arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
- arch_rnd(mmap32_rnd_bits), task_size_32bit());
+ arch_rnd(mmap32_rnd_bits), task_size_32bit(),
+ rlim_stack);
#endif
}
@@ -162,7 +166,7 @@ unsigned long get_mmap_base(int is_legacy)
struct mm_struct *mm = current->mm;
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
- if (in_compat_syscall()) {
+ if (in_32bit_syscall()) {
return is_legacy ? mm->mmap_compat_legacy_base
: mm->mmap_compat_base;
}
@@ -236,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
return phys_addr_valid(addr + count - 1);
}
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unpriv. user to set them to PROT_NONE and invert
+ * them, then pointing to valid memory for L1TF speculation.
+ *
+ * Note: for locked down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return true;
+ if (!__pte_needs_invert(pgprot_val(prot)))
+ return true;
+ /* If it's real memory always allow */
+ if (pfn_valid(pfn))
+ return true;
+ if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+ return false;
+ return true;
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index e500949bae24..2385538e8065 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -118,14 +118,11 @@ bad_opcode:
* anything it wants in to the instructions. We can not
* trust anything about it. They might not be valid
* instructions or might encode invalid registers, etc...
- *
- * The caller is expected to kfree() the returned siginfo_t.
*/
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
+int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
{
const struct mpx_bndreg_state *bndregs;
const struct mpx_bndreg *bndreg;
- siginfo_t *info = NULL;
struct insn insn;
uint8_t bndregno;
int err;
@@ -153,11 +150,6 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
/* now go select the individual register in the set of 4 */
bndreg = &bndregs->bndreg[bndregno];
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info) {
- err = -ENOMEM;
- goto err_out;
- }
/*
* The registers are always 64-bit, but the upper 32
* bits are ignored in 32-bit mode. Also, note that the
@@ -168,27 +160,23 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
* complains when casting from integers to different-size
* pointers.
*/
- info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
- info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
- info->si_addr_lsb = 0;
- info->si_signo = SIGSEGV;
- info->si_errno = 0;
- info->si_code = SEGV_BNDERR;
- info->si_addr = insn_get_addr_ref(&insn, regs);
+ info->lower = (void __user *)(unsigned long)bndreg->lower_bound;
+ info->upper = (void __user *)(unsigned long)~bndreg->upper_bound;
+ info->addr = insn_get_addr_ref(&insn, regs);
+
/*
* We were not able to extract an address from the instruction,
* probably because there was something invalid in it.
*/
- if (info->si_addr == (void __user *)-1) {
+ if (info->addr == (void __user *)-1) {
err = -EINVAL;
goto err_out;
}
- trace_mpx_bounds_register_exception(info->si_addr, bndreg);
- return info;
+ trace_mpx_bounds_register_exception(info->addr, bndreg);
+ return 0;
err_out:
/* info might be NULL, but kfree() handles that */
- kfree(info);
- return ERR_PTR(err);
+ return err;
}
static __user void *mpx_get_bounds_dir(void)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 25504d5aa816..1308f5408bf7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
@@ -136,13 +135,13 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
/* whine about and ignore invalid blks */
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- nid, start, end - 1);
+ pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
return 0;
}
if (mi->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("NUMA: too many memblk ranges\n");
+ pr_err("too many memblk ranges\n");
return -EINVAL;
}
@@ -196,7 +195,7 @@ static void __init alloc_node_data(int nid)
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
- nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ nd_pa = memblock_phys_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
MEMBLOCK_ALLOC_ACCESSIBLE);
@@ -267,14 +266,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
*/
if (bi->end > bj->start && bi->start < bj->end) {
if (bi->nid != bj->nid) {
- pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
bi->nid, bi->start, bi->end - 1,
bj->nid, bj->start, bj->end - 1);
return -EINVAL;
}
- pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1,
- bj->start, bj->end - 1);
+ pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
}
/*
@@ -364,7 +363,7 @@ static int __init numa_alloc_distance(void)
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
size, PAGE_SIZE);
if (!phys) {
- pr_warning("NUMA: Warning: can't allocate distance table!\n");
+ pr_warn("Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
@@ -410,14 +409,14 @@ void __init numa_set_distance(int from, int to, int distance)
if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
from < 0 || to < 0) {
- pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
- from, to, distance);
+ pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
return;
}
if ((u8)distance != distance ||
(from == to && distance != LOCAL_DISTANCE)) {
- pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
from, to, distance);
return;
}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index aca6295350f3..f2bd3d61e16b 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,7 +22,6 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/init.h>
@@ -60,17 +59,6 @@ void memory_present(int nid, unsigned long start, unsigned long end)
}
printk(KERN_CONT "\n");
}
-
-unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long nr_pages = end_pfn - start_pfn;
-
- if (!nr_pages)
- return 0;
-
- return (nr_pages + 1) * sizeof(struct page);
-}
#endif
extern unsigned long highend_pfn, highstart_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 066f3511d5f1..59d80160fa5a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -3,7 +3,7 @@
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 34a2a3bfde9c..abffa0be80da 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -6,7 +6,6 @@
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <asm/dma.h>
#include "numa_internal.h"
@@ -61,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
eb->nid = nid;
if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
- emu_nid_to_phys[nid] = nid;
+ emu_nid_to_phys[nid] = pb->nid;
pb->start += size;
if (pb->start >= pb->end) {
@@ -198,40 +197,73 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
return end;
}
+static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
+{
+ unsigned long max_pfn = PHYS_PFN(max_addr);
+ unsigned long base_pfn = PHYS_PFN(base);
+ unsigned long hole_pfns = PHYS_PFN(hole);
+
+ return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
+}
+
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
* `addr' to `max_addr'.
*
* Returns zero on success or negative on error.
*/
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size)
+ u64 addr, u64 max_addr, u64 size,
+ int nr_nodes, struct numa_memblk *pblk,
+ int nid)
{
nodemask_t physnode_mask = numa_nodes_parsed;
+ int i, ret, uniform = 0;
u64 min_size;
- int nid = 0;
- int i, ret;
- if (!size)
+ if ((!size && !nr_nodes) || (nr_nodes && !pblk))
return -1;
+
/*
- * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
- * increased accordingly if the requested size is too small. This
- * creates a uniform distribution of node sizes across the entire
- * machine (but not necessarily over physical nodes).
+ * In the 'uniform' case split the passed in physical node by
+ * nr_nodes, in the non-uniform case, ignore the passed in
+ * physical block and try to create nodes of at least size
+ * @size.
+ *
+ * In the uniform case, split the nodes strictly by physical
+ * capacity, i.e. ignore holes. In the non-uniform case account
+ * for holes and treat @size as a minimum floor.
*/
- min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
- min_size = max(min_size, FAKE_NODE_MIN_SIZE);
- if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
- min_size = (min_size + FAKE_NODE_MIN_SIZE) &
- FAKE_NODE_MIN_HASH_MASK;
+ if (!nr_nodes)
+ nr_nodes = MAX_NUMNODES;
+ else {
+ nodes_clear(physnode_mask);
+ node_set(pblk->nid, physnode_mask);
+ uniform = 1;
+ }
+
+ if (uniform) {
+ min_size = uniform_size(max_addr, addr, 0, nr_nodes);
+ size = min_size;
+ } else {
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the
+ * size per node is increased accordingly if the
+ * requested size is too small. This creates a uniform
+ * distribution of node sizes across the entire machine
+ * (but not necessarily over physical nodes).
+ */
+ min_size = uniform_size(max_addr, addr,
+ mem_hole_size(addr, max_addr), nr_nodes);
+ }
+ min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
size >> 20, min_size >> 20);
size = min_size;
}
- size &= FAKE_NODE_MIN_HASH_MASK;
+ size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
/*
* Fill physical nodes with fake nodes of size until there is no memory
@@ -248,10 +280,14 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
node_clear(i, physnode_mask);
continue;
}
+
start = pi->blk[phys_blk].start;
limit = pi->blk[phys_blk].end;
- end = find_end_of_node(start, limit, size);
+ if (uniform)
+ end = start + size;
+ else
+ end = find_end_of_node(start, limit, size);
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
@@ -266,7 +302,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
* next node, this one must extend to the end of the
* physical node.
*/
- if (limit - end - mem_hole_size(end, limit) < size)
+ if ((limit - end - mem_hole_size(end, limit) < size)
+ && !uniform)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
@@ -276,7 +313,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
return ret;
}
}
- return 0;
+ return nid;
+}
+
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+ 0, NULL, NUMA_NO_NODE);
}
int __init setup_emu2phys_nid(int *dfl_phys_nid)
@@ -346,7 +391,36 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* the fixed node size. Otherwise, if it is just a single number N,
* split the system RAM into N fake nodes.
*/
- if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ if (strchr(emu_cmdline, 'U')) {
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ unsigned long n;
+ int nid = 0;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = -1;
+ for_each_node_mask(i, physnode_mask) {
+ /*
+ * The reason we pass in blk[0] is due to
+ * numa_remove_memblk_from() called by
+ * emu_setup_memblk() will delete entry 0
+ * and then move everything else up in the pi.blk
+ * array. Therefore we should always be looking
+ * at blk[0].
+ */
+ ret = split_nodes_size_interleave_uniform(&ei, &pi,
+ pi.blk[0].start, pi.blk[0].end, 0,
+ n, &pi.blk[0], nid);
+ if (ret < 0)
+ break;
+ if (ret < n) {
+ pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
+ __func__, i, ret, n);
+ ret = -1;
+ break;
+ }
+ nid = ret;
+ }
+ } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
u64 size;
size = memparse(emu_cmdline, &emu_cmdline);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index a25588ad75ef..08f8f76a4852 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -5,7 +5,7 @@
* Clears the a test pte bit on random pages in the direct mapping,
* then reverts and compares page tables forwards and afterwards.
*/
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 85cf12219dea..db7a10082238 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -3,7 +3,7 @@
* Thanks to Ben LaHaise for precious feedback.
*/
#include <linux/highmem.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -37,11 +37,20 @@ struct cpa_data {
unsigned long numpages;
int flags;
unsigned long pfn;
- unsigned force_split : 1;
+ unsigned force_split : 1,
+ force_static_prot : 1;
int curpage;
struct page **pages;
};
+enum cpa_warn {
+ CPA_CONFLICT,
+ CPA_PROTECT,
+ CPA_DETECT,
+};
+
+static const int cpa_warn_level = CPA_PROTECT;
+
/*
* Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
* using cpa_lock. So that we don't allow any other cpu, with stale large tlb
@@ -53,6 +62,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -93,21 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
static inline void split_page_count(int level) { }
#endif
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_CPA_STATISTICS
-static inline unsigned long highmap_start_pfn(void)
+static unsigned long cpa_1g_checked;
+static unsigned long cpa_1g_sameprot;
+static unsigned long cpa_1g_preserved;
+static unsigned long cpa_2m_checked;
+static unsigned long cpa_2m_sameprot;
+static unsigned long cpa_2m_preserved;
+static unsigned long cpa_4k_install;
+
+static inline void cpa_inc_1g_checked(void)
{
- return __pa_symbol(_text) >> PAGE_SHIFT;
+ cpa_1g_checked++;
}
-static inline unsigned long highmap_end_pfn(void)
+static inline void cpa_inc_2m_checked(void)
{
- /* Do not reference physical address outside the kernel. */
- return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
+ cpa_2m_checked++;
}
+static inline void cpa_inc_4k_install(void)
+{
+ cpa_4k_install++;
+}
+
+static inline void cpa_inc_lp_sameprot(int level)
+{
+ if (level == PG_LEVEL_1G)
+ cpa_1g_sameprot++;
+ else
+ cpa_2m_sameprot++;
+}
+
+static inline void cpa_inc_lp_preserved(int level)
+{
+ if (level == PG_LEVEL_1G)
+ cpa_1g_preserved++;
+ else
+ cpa_2m_preserved++;
+}
+
+static int cpastats_show(struct seq_file *m, void *p)
+{
+ seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
+ seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
+ seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
+ seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
+ seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
+ seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
+ seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
+ return 0;
+}
+
+static int cpastats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpastats_show, NULL);
+}
+
+static const struct file_operations cpastats_fops = {
+ .open = cpastats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init cpa_stats_init(void)
+{
+ debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
+ &cpastats_fops);
+ return 0;
+}
+late_initcall(cpa_stats_init);
+#else
+static inline void cpa_inc_1g_checked(void) { }
+static inline void cpa_inc_2m_checked(void) { }
+static inline void cpa_inc_4k_install(void) { }
+static inline void cpa_inc_lp_sameprot(int level) { }
+static inline void cpa_inc_lp_preserved(int level) { }
#endif
+
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -120,6 +196,38 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr <= end;
}
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+ return __pa_symbol(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+ /* Do not reference physical address outside the kernel. */
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
+}
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
+{
+ /*
+ * Kernel text has an alias mapping at a high address, known
+ * here as "highmap".
+ */
+ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
+}
+
+#else
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
+{
+ /* There is no highmap on 32-bit */
+ return false;
+}
+
+#endif
+
/*
* Flushing functions
*/
@@ -172,19 +280,25 @@ static void __cpa_flush_all(void *arg)
static void cpa_flush_all(unsigned long cache)
{
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-static void __cpa_flush_range(void *arg)
+static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
{
- /*
- * We could optimize that further and do individual per page
- * tlb invalidates for a low number of pages. Caveat: we must
- * flush the high aliases on 64bit as well.
- */
- __flush_tlb_all();
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
+ WARN_ON(PAGE_ALIGN(start) != start);
+
+ if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ cpa_flush_all(cache);
+ return true;
+ }
+
+ flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
+
+ return !cache;
}
static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -192,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level;
unsigned long addr;
- BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
- WARN_ON(PAGE_ALIGN(start) != start);
-
- on_each_cpu(__cpa_flush_range, NULL, 1);
-
- if (!cache)
+ if (__cpa_flush_range(start, numpages, cache))
return;
/*
@@ -217,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
-static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+static void cpa_flush_array(unsigned long baddr, unsigned long *start,
+ int numpages, int cache,
int in_flags, struct page **pages)
{
unsigned int i, level;
-#ifdef CONFIG_PREEMPT
- /*
- * Avoid wbinvd() because it causes latencies on all CPUs,
- * regardless of any CPU isolation that may be in effect.
- *
- * This should be extended for CAT enabled systems independent of
- * PREEMPT because wbinvd() does not respect the CAT partitions and
- * this is exposed to unpriviledged users through the graphics
- * subsystem.
- */
- unsigned long do_wbinvd = 0;
-#else
- unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
-#endif
-
- BUG_ON(irqs_disabled());
- on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
-
- if (!cache || do_wbinvd)
+ if (__cpa_flush_range(baddr, numpages, cache))
return;
/*
@@ -268,82 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
}
}
-/*
- * Certain areas of memory on x86 require very specific protection flags,
- * for example the BIOS area or kernel text. Callers don't always get this
- * right (again, ioremap() on BIOS memory is not uncommon) so this function
- * checks and fixes these known static required protection bits.
- */
-static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
- unsigned long pfn)
+static bool overlaps(unsigned long r1_start, unsigned long r1_end,
+ unsigned long r2_start, unsigned long r2_end)
{
- pgprot_t forbidden = __pgprot(0);
+ return (r1_start <= r2_end && r1_end >= r2_start) ||
+ (r2_start <= r1_end && r2_end >= r1_start);
+}
- /*
- * The BIOS area between 640k and 1Mb needs to be executable for
- * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
- */
#ifdef CONFIG_PCI_BIOS
- if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
- pgprot_val(forbidden) |= _PAGE_NX;
+/*
+ * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
+ * based config access (CONFIG_PCI_GOBIOS) support.
+ */
+#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
+#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
+
+static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
+{
+ if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
+ return _PAGE_NX;
+ return 0;
+}
+#else
+static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
+{
+ return 0;
+}
#endif
- /*
- * The kernel text needs to be executable for obvious reasons
- * Does not cover __inittext since that is gone later on. On
- * 64bit we do not enforce !NX on the low mapping
- */
- if (within(address, (unsigned long)_text, (unsigned long)_etext))
- pgprot_val(forbidden) |= _PAGE_NX;
+/*
+ * The .rodata section needs to be read-only. Using the pfn catches all
+ * aliases. This also includes __ro_after_init, so do not enforce until
+ * kernel_set_to_readonly is true.
+ */
+static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
+{
+ unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
/*
- * The .rodata section needs to be read-only. Using the pfn
- * catches all aliases.
+ * Note: __end_rodata is at page aligned and not inclusive, so
+ * subtract 1 to get the last enforced PFN in the rodata area.
*/
- if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
- __pa_symbol(__end_rodata) >> PAGE_SHIFT))
- pgprot_val(forbidden) |= _PAGE_RW;
+ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
+
+ if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
+ return _PAGE_RW;
+ return 0;
+}
+
+/*
+ * Protect kernel text against becoming non executable by forbidding
+ * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
+ * out of which the kernel actually executes. Do not protect the low
+ * mapping.
+ *
+ * This does not cover __inittext since that is gone after boot.
+ */
+static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
+{
+ unsigned long t_end = (unsigned long)_etext - 1;
+ unsigned long t_start = (unsigned long)_text;
+
+ if (overlaps(start, end, t_start, t_end))
+ return _PAGE_NX;
+ return 0;
+}
#if defined(CONFIG_X86_64)
+/*
+ * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+ * kernel text mappings for the large page aligned text, rodata sections
+ * will be always read-only. For the kernel identity mappings covering the
+ * holes caused by this alignment can be anything that user asks.
+ *
+ * This will preserve the large page mappings for kernel text/data at no
+ * extra cost.
+ */
+static pgprotval_t protect_kernel_text_ro(unsigned long start,
+ unsigned long end)
+{
+ unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
+ unsigned long t_start = (unsigned long)_text;
+ unsigned int level;
+
+ if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
+ return 0;
/*
- * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
- * kernel text mappings for the large page aligned text, rodata sections
- * will be always read-only. For the kernel identity mappings covering
- * the holes caused by this alignment can be anything that user asks.
+ * Don't enforce the !RW mapping for the kernel text mapping, if
+ * the current mapping is already using small page mapping. No
+ * need to work hard to preserve large page mappings in this case.
*
- * This will preserve the large page mappings for kernel text/data
- * at no extra cost.
+ * This also fixes the Linux Xen paravirt guest boot failure caused
+ * by unexpected read-only mappings for kernel identity
+ * mappings. In this paravirt guest case, the kernel text mapping
+ * and the kernel identity mapping share the same page-table pages,
+ * so the protections for kernel text and identity mappings have to
+ * be the same.
*/
- if (kernel_set_to_readonly &&
- within(address, (unsigned long)_text,
- (unsigned long)__end_rodata_hpage_align)) {
- unsigned int level;
-
- /*
- * Don't enforce the !RW mapping for the kernel text mapping,
- * if the current mapping is already using small page mapping.
- * No need to work hard to preserve large page mappings in this
- * case.
- *
- * This also fixes the Linux Xen paravirt guest boot failure
- * (because of unexpected read-only mappings for kernel identity
- * mappings). In this paravirt guest case, the kernel text
- * mapping and the kernel identity mapping share the same
- * page-table pages. Thus we can't really use different
- * protections for the kernel text and identity mappings. Also,
- * these shared mappings are made of small page mappings.
- * Thus this don't enforce !RW mapping for small page kernel
- * text mapping logic will help Linux Xen parvirt guest boot
- * as well.
- */
- if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
- pgprot_val(forbidden) |= _PAGE_RW;
- }
+ if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
+ return _PAGE_RW;
+ return 0;
+}
+#else
+static pgprotval_t protect_kernel_text_ro(unsigned long start,
+ unsigned long end)
+{
+ return 0;
+}
#endif
- prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+static inline bool conflicts(pgprot_t prot, pgprotval_t val)
+{
+ return (pgprot_val(prot) & ~val) != pgprot_val(prot);
+}
- return prot;
+static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
+ unsigned long start, unsigned long end,
+ unsigned long pfn, const char *txt)
+{
+ static const char *lvltxt[] = {
+ [CPA_CONFLICT] = "conflict",
+ [CPA_PROTECT] = "protect",
+ [CPA_DETECT] = "detect",
+ };
+
+ if (warnlvl > cpa_warn_level || !conflicts(prot, val))
+ return;
+
+ pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
+ lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
+ (unsigned long long)val);
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
+ unsigned long pfn, unsigned long npg,
+ int warnlvl)
+{
+ pgprotval_t forbidden, res;
+ unsigned long end;
+
+ /*
+ * There is no point in checking RW/NX conflicts when the requested
+ * mapping is setting the page !PRESENT.
+ */
+ if (!(pgprot_val(prot) & _PAGE_PRESENT))
+ return prot;
+
+ /* Operate on the virtual address */
+ end = start + npg * PAGE_SIZE - 1;
+
+ res = protect_kernel_text(start, end);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
+ forbidden = res;
+
+ res = protect_kernel_text_ro(start, end);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
+ forbidden |= res;
+
+ /* Check the PFN directly */
+ res = protect_pci_bios(pfn, pfn + npg - 1);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
+ forbidden |= res;
+
+ res = protect_rodata(pfn, pfn + npg - 1);
+ check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
+ forbidden |= res;
+
+ return __pgprot(pgprot_val(prot) & ~forbidden);
}
/*
@@ -401,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
- return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
- if (cpa->pgd)
+ if (cpa->pgd)
return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level);
- return lookup_address(address, level);
+ return lookup_address(address, level);
}
/*
@@ -512,40 +701,52 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
#endif
}
-static int
-try_preserve_large_page(pte_t *kpte, unsigned long address,
- struct cpa_data *cpa)
+static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
{
- unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
+ /*
+ * _PAGE_GLOBAL means "global page" for present PTEs.
+ * But, it is also used to indicate _PAGE_PROTNONE
+ * for non-present PTEs.
+ *
+ * This ensures that a _PAGE_GLOBAL PTE going from
+ * present to non-present is not confused as
+ * _PAGE_PROTNONE.
+ */
+ if (!(pgprot_val(prot) & _PAGE_PRESENT))
+ pgprot_val(prot) &= ~_PAGE_GLOBAL;
+
+ return prot;
+}
+
+static int __should_split_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
+{
+ unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
+ pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, old_pte, *tmp;
- pgprot_t old_prot, new_prot, req_prot;
- int i, do_split = 1;
enum pg_level level;
- if (cpa->force_split)
- return 1;
-
- spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up already:
*/
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
- goto out_unlock;
+ return 1;
switch (level) {
case PG_LEVEL_2M:
old_prot = pmd_pgprot(*(pmd_t *)kpte);
old_pfn = pmd_pfn(*(pmd_t *)kpte);
+ cpa_inc_2m_checked();
break;
case PG_LEVEL_1G:
old_prot = pud_pgprot(*(pud_t *)kpte);
old_pfn = pud_pfn(*(pud_t *)kpte);
+ cpa_inc_1g_checked();
break;
default:
- do_split = -EINVAL;
- goto out_unlock;
+ return -EINVAL;
}
psize = page_level_size(level);
@@ -555,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Calculate the number of pages, which fit into this large
* page starting at address:
*/
- nextpage_addr = (address + psize) & pmask;
- numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+ lpaddr = (address + psize) & pmask;
+ numpages = (lpaddr - address) >> PAGE_SHIFT;
if (numpages < cpa->numpages)
cpa->numpages = numpages;
@@ -566,6 +767,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* up accordingly.
*/
old_pte = *kpte;
+ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
req_prot = pgprot_large_2_4k(old_prot);
pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
@@ -577,86 +779,147 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* different bit positions in the two formats.
*/
req_prot = pgprot_4k_2_large(req_prot);
-
- /*
- * Set the PSE and GLOBAL flags only if the PRESENT flag is
- * set otherwise pmd_present/pmd_huge will return true even on
- * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
- * for the ancient hardware that doesn't support it.
- */
+ req_prot = pgprot_clear_protnone_bits(req_prot);
if (pgprot_val(req_prot) & _PAGE_PRESENT)
- pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
- else
- pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
-
- req_prot = canon_pgprot(req_prot);
+ pgprot_val(req_prot) |= _PAGE_PSE;
/*
- * old_pfn points to the large page base pfn. So we need
- * to add the offset of the virtual address:
+ * old_pfn points to the large page base pfn. So we need to add the
+ * offset of the virtual address:
*/
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
- new_prot = static_protections(req_prot, address, pfn);
+ /*
+ * Calculate the large page base address and the number of 4K pages
+ * in the large page
+ */
+ lpaddr = address & pmask;
+ numpages = psize >> PAGE_SHIFT;
/*
- * We need to check the full range, whether
- * static_protection() requires a different pgprot for one of
- * the pages in the range we try to preserve:
+ * Sanity check that the existing mapping is correct versus the static
+ * protections. static_protections() guards against !PRESENT, so no
+ * extra conditional required here.
*/
- addr = address & pmask;
- pfn = old_pfn;
- for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
- pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
+ CPA_CONFLICT);
- if (pgprot_val(chk_prot) != pgprot_val(new_prot))
- goto out_unlock;
+ if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
+ /*
+ * Split the large page and tell the split code to
+ * enforce static protections.
+ */
+ cpa->force_static_prot = 1;
+ return 1;
}
/*
- * If there are no changes, return. maxpages has been updated
- * above:
+ * Optimization: If the requested pgprot is the same as the current
+ * pgprot, then the large page can be preserved and no updates are
+ * required independent of alignment and length of the requested
+ * range. The above already established that the current pgprot is
+ * correct, which in consequence makes the requested pgprot correct
+ * as well if it is the same. The static protection scan below will
+ * not come to a different conclusion.
*/
- if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
- do_split = 0;
- goto out_unlock;
+ if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
+ cpa_inc_lp_sameprot(level);
+ return 0;
}
/*
- * We need to change the attributes. Check, whether we can
- * change the large page in one go. We request a split, when
- * the address is not aligned and the number of pages is
- * smaller than the number of pages in the large page. Note
- * that we limited the number of possible pages already to
- * the number of pages in the large page.
+ * If the requested range does not cover the full page, split it up
*/
- if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
- /*
- * The address is aligned and the number of pages
- * covers the full page.
- */
- new_pte = pfn_pte(old_pfn, new_prot);
- __set_pmd_pte(kpte, address, new_pte);
- cpa->flags |= CPA_FLUSHTLB;
- do_split = 0;
- }
+ if (address != lpaddr || cpa->numpages != numpages)
+ return 1;
+
+ /*
+ * Check whether the requested pgprot is conflicting with a static
+ * protection requirement in the large page.
+ */
+ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
+ CPA_DETECT);
+
+ /*
+ * If there is a conflict, split the large page.
+ *
+ * There used to be a 4k wise evaluation trying really hard to
+ * preserve the large pages, but experimentation has shown, that this
+ * does not help at all. There might be corner cases which would
+ * preserve one large page occasionally, but it's really not worth the
+ * extra code and cycles for the common case.
+ */
+ if (pgprot_val(req_prot) != pgprot_val(new_prot))
+ return 1;
+
+ /* All checks passed. Update the large page mapping. */
+ new_pte = pfn_pte(old_pfn, new_prot);
+ __set_pmd_pte(kpte, address, new_pte);
+ cpa->flags |= CPA_FLUSHTLB;
+ cpa_inc_lp_preserved(level);
+ return 0;
+}
+
+static int should_split_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
+{
+ int do_split;
-out_unlock:
+ if (cpa->force_split)
+ return 1;
+
+ spin_lock(&pgd_lock);
+ do_split = __should_split_large_page(kpte, address, cpa);
spin_unlock(&pgd_lock);
return do_split;
}
+static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
+ pgprot_t ref_prot, unsigned long address,
+ unsigned long size)
+{
+ unsigned int npg = PFN_DOWN(size);
+ pgprot_t prot;
+
+ /*
+ * If should_split_large_page() discovered an inconsistent mapping,
+ * remove the invalid protection in the split mapping.
+ */
+ if (!cpa->force_static_prot)
+ goto set;
+
+ prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
+
+ if (pgprot_val(prot) == pgprot_val(ref_prot))
+ goto set;
+
+ /*
+ * If this is splitting a PMD, fix it up. PUD splits cannot be
+ * fixed trivially as that would require to rescan the newly
+ * installed PMD mappings after returning from split_large_page()
+ * so an eventual further split can allocate the necessary PTE
+ * pages. Warn for now and revisit it in case this actually
+ * happens.
+ */
+ if (size == PAGE_SIZE)
+ ref_prot = prot;
+ else
+ pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
+set:
+ set_pte(pte, pfn_pte(pfn, ref_prot));
+}
+
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base)
{
+ unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
pte_t *pbase = (pte_t *)page_address(base);
- unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
- pte_t *tmp;
pgprot_t ref_prot;
+ pte_t *tmp;
spin_lock(&pgd_lock);
/*
@@ -674,16 +937,22 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
switch (level) {
case PG_LEVEL_2M:
ref_prot = pmd_pgprot(*(pmd_t *)kpte);
- /* clear PSE and promote PAT bit to correct position */
+ /*
+ * Clear PSE (aka _PAGE_PAT) and move
+ * PAT bit to correct position.
+ */
ref_prot = pgprot_large_2_4k(ref_prot);
ref_pfn = pmd_pfn(*(pmd_t *)kpte);
+ lpaddr = address & PMD_MASK;
+ lpinc = PAGE_SIZE;
break;
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
-
+ lpaddr = address & PUD_MASK;
+ lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
* otherwise pmd_present/pmd_huge will return true
@@ -698,23 +967,14 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
return 1;
}
- /*
- * Set the GLOBAL flags only if the PRESENT flag is set
- * otherwise pmd/pte_present will return true even on a non
- * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
- * for the ancient hardware that doesn't support it.
- */
- if (pgprot_val(ref_prot) & _PAGE_PRESENT)
- pgprot_val(ref_prot) |= _PAGE_GLOBAL;
- else
- pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
+ ref_prot = pgprot_clear_protnone_bits(ref_prot);
/*
* Get the target pfn from the original entry:
*/
pfn = ref_pfn;
- for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
- set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
+ for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
+ split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
if (virt_addr_valid(address)) {
unsigned long pfn = PFN_DOWN(__pa(address));
@@ -733,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
/*
- * Intel Atom errata AAH41 workaround.
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
*
- * The real fix should be in hw or in a microcode update, but
- * we also probabilistically try to reduce the window of having
- * a large TLB mixed with 4K TLBs while instruction fetches are
- * going on.
+ * Without this, we violate the TLB application note, that says:
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
*/
- __flush_tlb_all();
+ flush_tlb_all();
spin_unlock(&pgd_lock);
return 0;
@@ -930,19 +1200,7 @@ static void populate_pte(struct cpa_data *cpa,
pte = pte_offset_kernel(pmd, start);
- /*
- * Set the GLOBAL flags only if the PRESENT flag is
- * set otherwise pte_present will return true even on
- * a non present pte. The canon_pgprot will clear
- * _PAGE_GLOBAL for the ancient hardware that doesn't
- * support it.
- */
- if (pgprot_val(pgprot) & _PAGE_PRESENT)
- pgprot_val(pgprot) |= _PAGE_GLOBAL;
- else
- pgprot_val(pgprot) &= ~_PAGE_GLOBAL;
-
- pgprot = canon_pgprot(pgprot);
+ pgprot = pgprot_clear_protnone_bits(pgprot);
while (num_pages-- && start < end) {
set_pte(pte, pfn_pte(cpa->pfn, pgprot));
@@ -1004,8 +1262,8 @@ static long populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pmd_pgprot)));
+ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
+ canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
@@ -1077,8 +1335,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pud_pgprot)));
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
@@ -1190,6 +1448,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
cpa->numpages = 1;
cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
return 0;
+
+ } else if (__cpa_pfn_in_highmap(cpa->pfn)) {
+ /* Faults in the highmap are OK, so do not warn: */
+ return -EFAULT;
} else {
WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
@@ -1232,26 +1494,18 @@ repeat:
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
- new_prot = static_protections(new_prot, address, pfn);
+ cpa_inc_4k_install();
+ new_prot = static_protections(new_prot, address, pfn, 1,
+ CPA_PROTECT);
- /*
- * Set the GLOBAL flags only if the PRESENT flag is
- * set otherwise pte_present will return true even on
- * a non present pte. The canon_pgprot will clear
- * _PAGE_GLOBAL for the ancient hardware that doesn't
- * support it.
- */
- if (pgprot_val(new_prot) & _PAGE_PRESENT)
- pgprot_val(new_prot) |= _PAGE_GLOBAL;
- else
- pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
+ new_prot = pgprot_clear_protnone_bits(new_prot);
/*
* We need to keep the pfn from the existing PTE,
* after all we're only going to change it's attributes
* not the memory it points to
*/
- new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+ new_pte = pfn_pte(pfn, new_prot);
cpa->pfn = pfn;
/*
* Do we really change anything ?
@@ -1268,7 +1522,7 @@ repeat:
* Check, whether we can keep the large page intact
* and just change the pte:
*/
- do_split = try_preserve_large_page(kpte, address, cpa);
+ do_split = should_split_large_page(kpte, address, cpa);
/*
* When the range fits into the existing large page,
* return. cp->numpages and cpa->tlbflush have been updated in
@@ -1281,28 +1535,8 @@ repeat:
* We have to split the large page:
*/
err = split_large_page(cpa, kpte, address);
- if (!err) {
- /*
- * Do a global flush tlb after splitting the large page
- * and before we do the actual change page attribute in the PTE.
- *
- * With out this, we violate the TLB application note, that says
- * "The TLBs may contain both ordinary and large-page
- * translations for a 4-KByte range of linear addresses. This
- * may occur if software modifies the paging structures so that
- * the page size used for the address range changes. If the two
- * translations differ with respect to page frame or attributes
- * (e.g., permissions), processor behavior is undefined and may
- * be implementation-specific."
- *
- * We do this global tlb flush inside the cpa_lock, so that we
- * don't allow any other cpu, with stale tlb entries change the
- * page attribute in parallel, that also falls into the
- * just split large page entry.
- */
- flush_tlb_all();
+ if (!err)
goto repeat;
- }
return err;
}
@@ -1352,8 +1586,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* to touch the high mapped kernel as well:
*/
if (!within(vaddr, (unsigned long)_text, _brk_end) &&
- within_inclusive(cpa->pfn, highmap_start_pfn(),
- highmap_end_pfn())) {
+ __cpa_pfn_in_highmap(cpa->pfn)) {
unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
__START_KERNEL_map - phys_base;
alias_cpa = *cpa;
@@ -1416,6 +1649,29 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
return 0;
}
+/*
+ * Machine check recovery code needs to change cache mode of poisoned
+ * pages to UC to avoid speculative access logging another error. But
+ * passing the address of the 1:1 mapping to set_memory_uc() is a fine
+ * way to encourage a speculative access. So we cheat and flip the top
+ * bit of the address. This works fine for the code that updates the
+ * page tables. But at the end of the process we need to flush the cache
+ * and the non-canonical address causes a #GP fault when used by the
+ * CLFLUSH instruction.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long make_addr_canonical_again(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
+
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
@@ -1428,11 +1684,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
memset(&cpa, 0, sizeof(cpa));
/*
- * Check, if we are requested to change a not supported
- * feature:
+ * Check, if we are requested to set a not supported
+ * feature. Clearing non-supported features is OK.
*/
mask_set = canon_pgprot(mask_set);
- mask_clr = canon_pgprot(mask_clr);
+
if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
return 0;
@@ -1461,7 +1717,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
* Save address for cache flush. *addr is modified in the call
* to __change_page_attr_set_clr() below.
*/
- baddr = *addr;
+ baddr = make_addr_canonical_again(*addr);
}
/* Must avoid aliasing mappings in the highmem code */
@@ -1483,6 +1739,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+ /* Has caller explicitly disabled alias checking? */
+ if (in_flag & CPA_NO_CHECK_ALIAS)
+ checkalias = 0;
ret = __change_page_attr_set_clr(&cpa, checkalias);
@@ -1499,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = !!pgprot2cachemode(mask_set);
/*
- * On success we use CLFLUSH, when the CPU supports it to
- * avoid the WBINVD. If the CPU does not support it and in the
- * error case we fall back to cpa_flush_all (which uses
- * WBINVD):
+ * On error; flush everything to be sure.
*/
- if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
- if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
- cpa_flush_array(addr, numpages, cache,
- cpa.flags, pages);
- } else
- cpa_flush_range(baddr, numpages, cache);
- } else
+ if (ret) {
cpa_flush_all(cache);
+ goto out;
+ }
+
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(baddr, addr, numpages, cache,
+ cpa.flags, pages);
+ } else {
+ cpa_flush_range(baddr, numpages, cache);
+ }
out:
return ret;
@@ -1769,12 +2028,33 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
+int set_memory_np_noalias(unsigned long addr, int numpages)
+{
+ int cpa_flags = CPA_NO_CHECK_ALIAS;
+
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(_PAGE_PRESENT), 0,
+ cpa_flags, NULL);
+}
+
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
__pgprot(0), 1, 0, NULL);
}
+int set_memory_nonglobal(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
+int set_memory_global(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
@@ -1805,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
- if (static_cpu_has(X86_FEATURE_CLFLUSH))
- cpa_flush_range(start, numpages, 1);
- else
- cpa_flush_all(1);
+ cpa_flush_range(start, numpages, 1);
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -1819,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
* in case TLB flushing gets optimized in the cpa_flush_range()
* path use the same logic as above.
*/
- if (static_cpu_has(X86_FEATURE_CLFLUSH))
- cpa_flush_range(start, numpages, 0);
- else
- cpa_flush_all(0);
+ cpa_flush_range(start, numpages, 0);
return ret;
}
@@ -2035,9 +2309,13 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
/*
* We should perform an IPI and flush all tlbs,
- * but that can deadlock->flush only current cpu:
+ * but that can deadlock->flush only current cpu.
+ * Preemption needs to be disabled around __flush_tlb_all() due to
+ * CR3 reload in __native_flush_tlb().
*/
+ preempt_disable();
__flush_tlb_all();
+ preempt_enable();
arch_flush_lazy_mmu_mode();
}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 1555bd7d3449..08013524fba1 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -8,7 +8,7 @@
*/
#include <linux/seq_file.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
@@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end)
return 0;
}
+static u64 sanitize_phys(u64 address)
+{
+ /*
+ * When changing the memtype for pages containing poison allow
+ * for a "decoy" virtual address (bit 63 clear) passed to
+ * set_memory_X(). __pa() on a "decoy" address results in a
+ * physical address with bit 63 set.
+ */
+ return address & __PHYSICAL_MASK;
+}
+
/*
* req_type typically has one of the:
* - _PAGE_CACHE_MODE_WB
@@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
int is_range_ram;
int err = 0;
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled()) {
@@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end)
if (!pat_enabled())
return 0;
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
+
/* Low ISA region is always mapped WB. No need to track */
if (x86_platform.is_untracked_pat_range(start, end))
return 0;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 34cda7e0551b..59274e2c1ac4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,12 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
+#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+EXPORT_SYMBOL(physical_mask);
+#endif
+
#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
@@ -57,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_table(tlb, pte);
+ paravirt_tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -73,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
- tlb_remove_table(tlb, page);
+ paravirt_tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(pud));
+ paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(p4d));
+ paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -109,17 +115,18 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+#define MAX_UNSHARED_PTRS_PER_PGD \
+ max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
- BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
- virt_to_page(pgd)->index = (pgoff_t)mm;
+ virt_to_page(pgd)->pt_mm = mm;
}
struct mm_struct *pgd_page_get_mm(struct page *page)
{
- return (struct mm_struct *)page->index;
+ return page->pt_mm;
}
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
@@ -176,6 +183,16 @@ static void pgd_dtor(pgd_t *pgd)
* and initialize the kernel pmds here.
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD
+
+/*
+ * We allocate separate PMDs for the kernel part of the user page-table
+ * when PTI is enabled. We need them to map the per-process LDT into the
+ * user-space page-table.
+ */
+#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
+ KERNEL_PGD_PTRS : 0)
+#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
@@ -197,14 +214,16 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
-
+#define MAX_PREALLOCATED_PMDS 0
+#define PREALLOCATED_USER_PMDS 0
+#define MAX_PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */
-static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
- for(i = 0; i < PREALLOCATED_PMDS; i++)
+ for (i = 0; i < count; i++)
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
@@ -212,7 +231,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
}
-static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
bool failed = false;
@@ -221,7 +240,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ for (i = 0; i < count; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
if (!pmd)
failed = true;
@@ -236,7 +255,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
if (failed) {
- free_pmds(mm, pmds);
+ free_pmds(mm, pmds, count);
return -ENOMEM;
}
@@ -249,23 +268,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
+static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = *pgdp;
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgd_clear(pgdp);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ mm_dec_nr_pmds(mm);
+ }
+}
+
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
- for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pgd_t pgd = pgdp[i];
+ for (i = 0; i < PREALLOCATED_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i]);
- if (pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pgdp[i] = native_make_pgd(0);
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- mm_dec_nr_pmds(mm);
- }
- }
+ pgdp = kernel_to_user_pgdp(pgdp);
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
+#endif
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
@@ -291,6 +325,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+ pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ p4d_t *u_p4d;
+ pud_t *u_pud;
+ int i;
+
+ u_p4d = p4d_offset(u_pgd, 0);
+ u_pud = pud_offset(u_p4d, 0);
+
+ s_pgd += KERNEL_PGD_BOUNDARY;
+ u_pud += KERNEL_PGD_BOUNDARY;
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
+ pmd_t *pmd = pmds[i];
+
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, u_pud, pmd);
+ }
+
+}
+#else
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+}
+#endif
/*
* Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
* assumes that pgd should be in one page.
@@ -324,9 +390,6 @@ static int __init pgd_cache_init(void)
*/
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL);
- if (!pgd_cache)
- return -ENOMEM;
-
return 0;
}
core_initcall(pgd_cache_init);
@@ -338,7 +401,8 @@ static inline pgd_t *_pgd_alloc(void)
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP,
+ PGD_ALLOCATION_ORDER);
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
@@ -350,7 +414,7 @@ static inline pgd_t *_pgd_alloc(void)
static inline void _pgd_free(pgd_t *pgd)
{
if (!SHARED_KERNEL_PMD)
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
else
kmem_cache_free(pgd_cache, pgd);
}
@@ -370,7 +434,8 @@ static inline void _pgd_free(pgd_t *pgd)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
- pmd_t *pmds[PREALLOCATED_PMDS];
+ pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
+ pmd_t *pmds[MAX_PREALLOCATED_PMDS];
pgd = _pgd_alloc();
@@ -379,12 +444,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
- if (preallocate_pmds(mm, pmds) != 0)
+ if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
- if (paravirt_pgd_alloc(mm) != 0)
+ if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_user_pmds;
+
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
@@ -394,13 +462,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
+ pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
+out_free_user_pmds:
+ free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
- free_pmds(mm, pmds);
+ free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
@@ -429,7 +500,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty)
- *ptep = entry;
+ set_pte(ptep, entry);
return changed;
}
@@ -444,7 +515,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
- *pmdp = entry;
+ set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
@@ -464,7 +535,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
if (changed && dirty) {
- *pudp = entry;
+ set_pud(pudp, entry);
/*
* We had a write-protection fault here and changed the pud
* to to more permissive. No need to flush the TLB for that,
@@ -572,6 +643,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
+#ifdef CONFIG_X86_64
+ /*
+ * Ensure that the static initial page tables are covering the
+ * fixmap completely.
+ */
+ BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
+ (FIXMAP_PMD_NUM * PTRS_PER_PTE));
+#endif
+
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
@@ -583,6 +663,9 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
pgprot_t flags)
{
+ /* Sanitize 'prot' against any unsupported bits: */
+ pgprot_val(flags) &= __default_kernel_pte_mask;
+
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
@@ -636,6 +719,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
(mtrr != MTRR_TYPE_WRBACK))
return 0;
+ /* Bail out if we are we on a populated non-leaf entry: */
+ if (pud_present(*pud) && !pud_huge(*pud))
+ return 0;
+
prot = pgprot_4k_2_large(prot);
set_pte((pte_t *)pud, pfn_pte(
@@ -664,6 +751,10 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
return 0;
}
+ /* Bail out if we are we on a populated non-leaf entry: */
+ if (pmd_present(*pmd) && !pmd_huge(*pmd))
+ return 0;
+
prot = pgprot_4k_2_large(prot);
set_pte((pte_t *)pmd, pfn_pte(
@@ -703,28 +794,50 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i]))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -733,11 +846,12 @@ int pud_free_pmd_page(pud_t *pud)
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
@@ -746,8 +860,30 @@ int pmd_free_pte_page(pmd_t *pmd)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
}
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index 7f9acb68324c..bdc98150d4db 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index d7bc0eea20a5..6e98e0a7c923 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
*/
if (pkey != -1)
return pkey;
- /*
- * Look for a protection-key-drive execute-only mapping
- * which is now being given permissions that are not
- * execute-only. Move it back to the default pkey.
- */
- if (vma_is_pkey_exec_only(vma) &&
- (prot & (PROT_READ|PROT_WRITE))) {
- return 0;
- }
+
/*
* The mapping is execute-only. Go try to get the
* execute-only protection key. If we fail to do that,
* fall through as if we do not have execute-only
- * support.
+ * support in this mm.
*/
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(vma->vm_mm);
if (pkey > 0)
return pkey;
+ } else if (vma_is_pkey_exec_only(vma)) {
+ /*
+ * Protections are *not* PROT_EXEC, but the mapping
+ * is using the exec-only pkey. This mapping was
+ * PROT_EXEC and will no longer be. Move back to
+ * the default pkey.
+ */
+ return ARCH_DEFAULT_PKEY;
}
+
/*
* This is a vanilla, non-pkey mprotect (or we failed to
* setup execute-only), inherit the pkey from the VMA we
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 631507f0c198..4fee5c3003ed 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -45,6 +45,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/sections.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -54,6 +55,16 @@
#define __GFP_NOTRACK 0
#endif
+/*
+ * Define the page-table levels we clone for user-space on 32
+ * and 64 bit.
+ */
+#ifdef CONFIG_X86_64
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD
+#else
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE
+#endif
+
static void __init pti_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
@@ -66,12 +77,22 @@ static void __init pti_print_if_secure(const char *reason)
pr_info("%s\n", reason);
}
+enum pti_mode {
+ PTI_AUTO = 0,
+ PTI_FORCE_OFF,
+ PTI_FORCE_ON
+} pti_mode;
+
void __init pti_check_boottime_disable(void)
{
char arg[5];
int ret;
+ /* Assume mode is auto unless overridden. */
+ pti_mode = PTI_AUTO;
+
if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+ pti_mode = PTI_FORCE_OFF;
pti_print_if_insecure("disabled on XEN PV.");
return;
}
@@ -79,18 +100,23 @@ void __init pti_check_boottime_disable(void)
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
if (ret > 0) {
if (ret == 3 && !strncmp(arg, "off", 3)) {
+ pti_mode = PTI_FORCE_OFF;
pti_print_if_insecure("disabled on command line.");
return;
}
if (ret == 2 && !strncmp(arg, "on", 2)) {
+ pti_mode = PTI_FORCE_ON;
pti_print_if_secure("force enabled on command line.");
goto enable;
}
- if (ret == 4 && !strncmp(arg, "auto", 4))
+ if (ret == 4 && !strncmp(arg, "auto", 4)) {
+ pti_mode = PTI_AUTO;
goto autosel;
+ }
}
if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+ pti_mode = PTI_FORCE_OFF;
pti_print_if_insecure("disabled on command line.");
return;
}
@@ -102,7 +128,7 @@ enable:
setup_force_cpu_cap(X86_FEATURE_PTI);
}
-pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
/*
* Changes to the high (kernel) portion of the kernelmode page
@@ -149,7 +175,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
*
* Returns a pointer to a P4D on success, or NULL on failure.
*/
-static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
@@ -161,7 +187,7 @@ static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
- if (!new_p4d_page)
+ if (WARN_ON_ONCE(!new_p4d_page))
return NULL;
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -177,16 +203,20 @@ static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
*
* Returns a pointer to a PMD on success, or NULL on failure.
*/
-static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+ p4d_t *p4d;
pud_t *pud;
+ p4d = pti_user_pagetable_walk_p4d(address);
+ if (!p4d)
+ return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
- if (!new_pud_page)
+ if (WARN_ON_ONCE(!new_pud_page))
return NULL;
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -200,7 +230,7 @@ static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
- if (!new_pmd_page)
+ if (WARN_ON_ONCE(!new_pmd_page))
return NULL;
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -209,7 +239,6 @@ static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
return pmd_offset(pud, address);
}
-#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
* Walk the shadow copy of the page tables (optionally) trying to allocate
* page table pages on the way down. Does not support large pages.
@@ -219,12 +248,16 @@ static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+ pmd_t *pmd;
pte_t *pte;
+ pmd = pti_user_pagetable_walk_pmd(address);
+ if (!pmd)
+ return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
@@ -247,6 +280,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
return pte;
}
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
static void __init pti_setup_vsyscall(void)
{
pte_t *pte, *target_pte;
@@ -267,8 +301,14 @@ static void __init pti_setup_vsyscall(void)
static void __init pti_setup_vsyscall(void) { }
#endif
-static void __init
-pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+enum pti_clone_level {
+ PTI_CLONE_PMD,
+ PTI_CLONE_PTE,
+};
+
+static void
+pti_clone_pgtable(unsigned long start, unsigned long end,
+ enum pti_clone_level level)
{
unsigned long addr;
@@ -276,38 +316,105 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
* Clone the populated PMDs which cover start to end. These PMD areas
* can have holes.
*/
- for (addr = start; addr < end; addr += PMD_SIZE) {
+ for (addr = start; addr < end;) {
+ pte_t *pte, *target_pte;
pmd_t *pmd, *target_pmd;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
+ /* Overflow check */
+ if (addr < start)
+ break;
+
pgd = pgd_offset_k(addr);
if (WARN_ON(pgd_none(*pgd)))
return;
p4d = p4d_offset(pgd, addr);
if (WARN_ON(p4d_none(*p4d)))
return;
+
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+ if (pud_none(*pud)) {
+ addr += PUD_SIZE;
continue;
+ }
+
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ if (pmd_none(*pmd)) {
+ addr += PMD_SIZE;
continue;
+ }
- target_pmd = pti_user_pagetable_walk_pmd(addr);
- if (WARN_ON(!target_pmd))
- return;
-
- /*
- * Copy the PMD. That is, the kernelmode and usermode
- * tables will share the last-level page tables of this
- * address range
- */
- *target_pmd = pmd_clear_flags(*pmd, clear);
+ if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Only clone present PMDs. This ensures only setting
+ * _PAGE_GLOBAL on present PMDs. This should only be
+ * called on well-known addresses anyway, so a non-
+ * present PMD would be a surprise.
+ */
+ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
+ return;
+
+ /*
+ * Setting 'target_pmd' below creates a mapping in both
+ * the user and kernel page tables. It is effectively
+ * global, so set it as global in both copies. Note:
+ * the X86_FEATURE_PGE check is not _required_ because
+ * the CPU ignores _PAGE_GLOBAL when PGE is not
+ * supported. The check keeps consistentency with
+ * code that only set this bit when supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = *pmd;
+
+ addr += PMD_SIZE;
+
+ } else if (level == PTI_CLONE_PTE) {
+
+ /* Walk the page-table down to the pte level */
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte)) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ /* Only clone present PTEs */
+ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
+ return;
+
+ /* Allocate PTE in the user page-table */
+ target_pte = pti_user_pagetable_walk_pte(addr);
+ if (WARN_ON(!target_pte))
+ return;
+
+ /* Set GLOBAL bit in both PTEs */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pte = pte_set_flags(*pte, _PAGE_GLOBAL);
+
+ /* Clone the PTE */
+ *target_pte = *pte;
+
+ addr += PAGE_SIZE;
+
+ } else {
+ BUG();
+ }
}
}
+#ifdef CONFIG_X86_64
/*
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
* next-level entry on 5-level systems.
@@ -318,18 +425,71 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
user_p4d = pti_user_pagetable_walk_p4d(addr);
+ if (!user_p4d)
+ return;
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
}
/*
- * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ * Clone the CPU_ENTRY_AREA and associated data into the user space visible
+ * page table.
*/
static void __init pti_clone_user_shared(void)
{
+ unsigned int cpu;
+
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+
+ for_each_possible_cpu(cpu) {
+ /*
+ * The SYSCALL64 entry code needs to be able to find the
+ * thread stack and needs one word of scratch space in which
+ * to spill a register. All of this lives in the TSS, in
+ * the sp1 and sp2 slots.
+ *
+ * This is done for all possible CPUs during boot to ensure
+ * that it's propagated to all mms. If we were to add one of
+ * these mappings during CPU hotplug, we would need to take
+ * some measure to make sure that every mm that subsequently
+ * ran on that CPU would have the relevant PGD entry in its
+ * pagetables. The usual vmalloc_fault() mechanism would not
+ * work for page faults taken in entry_SYSCALL_64 before RSP
+ * is set up.
+ */
+
+ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
+ phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
+ pte_t *target_pte;
+
+ target_pte = pti_user_pagetable_walk_pte(va);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
+ }
+}
+
+#else /* CONFIG_X86_64 */
+
+/*
+ * On 32 bit PAE systems with 1GB of Kernel address space there is only
+ * one pgd/p4d for the whole kernel. Cloning that would map the whole
+ * address space into the user page-tables, making PTI useless. So clone
+ * the page-table on the PMD level to prevent that.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ unsigned long start, end;
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
+
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD);
}
+#endif /* CONFIG_X86_64 */
/*
* Clone the ESPFIX P4D into the user space visible page table
@@ -344,11 +504,121 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry and irqentry text and force it RO.
*/
-static void __init pti_clone_entry_text(void)
+static void pti_clone_entry_text(void)
+{
+ pti_clone_pgtable((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end,
+ PTI_CLONE_PMD);
+}
+
+/*
+ * Global pages and PCIDs are both ways to make kernel TLB entries
+ * live longer, reduce TLB misses and improve kernel performance.
+ * But, leaving all kernel text Global makes it potentially accessible
+ * to Meltdown-style attacks which make it trivial to find gadgets or
+ * defeat KASLR.
+ *
+ * Only use global pages when it is really worth it.
+ */
+static inline bool pti_kernel_image_global_ok(void)
+{
+ /*
+ * Systems with PCIDs get litlle benefit from global
+ * kernel text and are not worth the downsides.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_PCID))
+ return false;
+
+ /*
+ * Only do global kernel image for pti=auto. Do the most
+ * secure thing (not global) if pti=on specified.
+ */
+ if (pti_mode != PTI_AUTO)
+ return false;
+
+ /*
+ * K8 may not tolerate the cleared _PAGE_RW on the userspace
+ * global kernel image pages. Do the safe thing (disable
+ * global kernel image). This is unlikely to ever be
+ * noticed because PTI is disabled by default on AMD CPUs.
+ */
+ if (boot_cpu_has(X86_FEATURE_K8))
+ return false;
+
+ /*
+ * RANDSTRUCT derives its hardening benefits from the
+ * attacker's lack of knowledge about the layout of kernel
+ * data structures. Keep the kernel image non-global in
+ * cases where RANDSTRUCT is in use to help keep the layout a
+ * secret.
+ */
+ if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+ return false;
+
+ return true;
+}
+
+/*
+ * This is the only user for these and it is not arch-generic
+ * like the other set_memory.h functions. Just extern them.
+ */
+extern int set_memory_nonglobal(unsigned long addr, int numpages);
+extern int set_memory_global(unsigned long addr, int numpages);
+
+/*
+ * For some configurations, map all of kernel text into the user page
+ * tables. This reduces TLB misses, especially on non-PCID systems.
+ */
+static void pti_clone_kernel_text(void)
{
- pti_clone_pmds((unsigned long) __entry_text_start,
- (unsigned long) __irqentry_text_end,
- _PAGE_RW | _PAGE_GLOBAL);
+ /*
+ * rodata is part of the kernel image and is normally
+ * readable on the filesystem or on the web. But, do not
+ * clone the areas past rodata, they might contain secrets.
+ */
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end_clone = (unsigned long)__end_rodata_aligned;
+ unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
+
+ if (!pti_kernel_image_global_ok())
+ return;
+
+ pr_debug("mapping partial kernel image into user address space\n");
+
+ /*
+ * Note that this will undo _some_ of the work that
+ * pti_set_kernel_image_nonglobal() did to clear the
+ * global bit.
+ */
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+
+ /*
+ * pti_clone_pgtable() will set the global bit in any PMDs
+ * that it clones, but we also need to get any PTEs in
+ * the last level for areas that are not huge-page-aligned.
+ */
+
+ /* Set the global bit for normal non-__init kernel text: */
+ set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
+}
+
+void pti_set_kernel_image_nonglobal(void)
+{
+ /*
+ * The identity map is created with PMDs, regardless of the
+ * actual length of the kernel. We need to clear
+ * _PAGE_GLOBAL up to a PMD boundary, not just to the end
+ * of the image.
+ */
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+
+ /*
+ * This clears _PAGE_GLOBAL from the entire kernel image.
+ * pti_clone_kernel_text() map put _PAGE_GLOBAL back for
+ * areas that are mapped to userspace.
+ */
+ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
/*
@@ -361,8 +631,53 @@ void __init pti_init(void)
pr_info("enabled\n");
+#ifdef CONFIG_X86_32
+ /*
+ * We check for X86_FEATURE_PCID here. But the init-code will
+ * clear the feature flag on 32 bit because the feature is not
+ * supported on 32 bit anyway. To print the warning we need to
+ * check with cpuid directly again.
+ */
+ if (cpuid_ecx(0x1) & BIT(17)) {
+ /* Use printk to work around pr_fmt() */
+ printk(KERN_WARNING "\n");
+ printk(KERN_WARNING "************************************************************\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
+ printk(KERN_WARNING "** Your performance will increase dramatically if you **\n");
+ printk(KERN_WARNING "** switch to a 64-bit kernel! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "************************************************************\n");
+ }
+#endif
+
pti_clone_user_shared();
+
+ /* Undo all global bits from the init pagetables in head_64.S: */
+ pti_set_kernel_image_nonglobal();
+ /* Replace some of the global bits just for shared entry text: */
pti_clone_entry_text();
pti_setup_espfix64();
pti_setup_vsyscall();
}
+
+/*
+ * Finalize the kernel mappings in the userspace page-table. Some of the
+ * mappings for the kernel image might have changed since pti_init()
+ * cloned them. This is because parts of the kernel image have been
+ * mapped RO and/or NX. These changes need to be cloned again to the
+ * userspace page-table.
+ */
+void pti_finalize(void)
+{
+ /*
+ * We need to clone everything (again) that maps parts of the
+ * kernel image.
+ */
+ pti_clone_entry_text();
+ pti_clone_kernel_text();
+
+ debug_checkwx_user();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7f1a51399674..bddd6b3cee1d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
+#include <linux/ptrace.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
-void clear_asid_other(void)
+static void clear_asid_other(void)
{
u16 asid;
@@ -157,7 +158,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
unsigned long sp = current_stack_pointer;
pgd_t *pgd = pgd_offset(mm, sp);
- if (CONFIG_PGTABLE_LEVELS > 4) {
+ if (pgtable_l5_enabled()) {
if (unlikely(pgd_none(*pgd))) {
pgd_t *pgd_ref = pgd_offset_k(sp);
@@ -180,13 +181,29 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}
+static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+{
+ /*
+ * Check if the current (previous) task has access to the memory
+ * of the @tsk (next) task. If access is denied, make sure to
+ * issue a IBPB to stop user->user Spectre-v2 attacks.
+ *
+ * Note: __ptrace_may_access() returns 0 or -ERRNO.
+ */
+ return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
+ ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+}
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+ bool need_flush;
+ u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -240,20 +257,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
- * We don't currently support having a real mm loaded without
- * our cpu set in mm_cpumask(). We have all the bookkeeping
- * in place to figure out whether we would need to flush
- * if our cpu were cleared in mm_cpumask(), but we don't
- * currently use it.
+ * Even in lazy TLB mode, the CPU should stay set in the
+ * mm_cpumask. The TLB shootdown code can figure out from
+ * from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
+ /*
+ * If the CPU is not in lazy TLB mode, we are just switching
+ * from one thread in a process to another thread in the same
+ * process. No TLB flush required.
+ */
+ if (!was_lazy)
+ return;
+
+ /*
+ * Read the tlb_gen to check whether a flush is needed.
+ * If the TLB is up to date, just use it.
+ * The barrier synchronizes with the tlb_gen increment in
+ * the TLB shootdown code.
+ */
+ smp_mb();
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+ next_tlb_gen)
+ return;
+
+ /*
+ * TLB contents went out of date while we were in lazy
+ * mode. Fall through to the TLB switching code below.
+ */
+ new_asid = prev_asid;
+ need_flush = true;
} else {
- u16 new_asid;
- bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -262,18 +300,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* one process from doing Spectre-v2 attacks on another.
*
* As an optimization, flush indirect branches only when
- * switching into processes that disable dumping. This
- * protects high value processes like gpg, without having
- * too high performance overhead. IBPB is *expensive*!
- *
- * This will not flush branches when switching into kernel
- * threads. It will also not flush if we switch to idle
- * thread and back to the same process. It will flush if we
- * switch to a different non-dumpable process.
+ * switching into a processes that can't be ptrace by the
+ * current one (as in such case, attacker has much more
+ * convenient way how to tamper with the next process than
+ * branch buffer poisoning).
*/
- if (tsk && tsk->mm &&
- tsk->mm->context.ctx_id != last_ctx_id &&
- get_dumpable(tsk->mm) != SUID_DUMP_USER)
+ if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
+ ibpb_needed(tsk, last_ctx_id))
indirect_branch_prediction_barrier();
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -285,55 +318,71 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
sync_current_stack_to_mm(next);
}
- /* Stop remote flushes for the previous mm */
- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ /*
+ * Stop remote flushes for the previous mm.
+ * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+ * but the bitmap manipulation can cause cache line contention.
+ */
+ if (real_prev != &init_mm) {
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+ mm_cpumask(real_prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ }
/*
* Start remote flushes and then read tlb_gen.
*/
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (next != &init_mm)
+ cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
-
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ /* Let nmi_uaccess_okay() know that we're changing CR3. */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+ }
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
- }
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
*/
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
- load_mm_cr4(next);
- switch_ldt(real_prev, next);
+ /*
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
+ */
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+ /* Make sure we write CR3 before loaded_mm. */
+ barrier();
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+ if (next != real_prev) {
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+ }
}
/*
@@ -354,20 +403,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- if (tlb_defer_switch_to_init_mm()) {
- /*
- * There's a significant optimization that may be possible
- * here. We have accurate enough TLB flush tracking that we
- * don't need to maintain coherence of TLB per se when we're
- * lazy. We do, however, need to maintain coherence of
- * paging-structure caches. We could, in principle, leave our
- * old mm loaded and only switch to init_mm when
- * tlb_remove_page() happens.
- */
- this_cpu_write(cpu_tlbstate.is_lazy, true);
- } else {
- switch_mm(NULL, &init_mm, NULL);
- }
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
@@ -454,6 +490,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
+ *
+ * This should be rare, with native_flush_tlb_others skipping
+ * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -514,17 +553,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
- unsigned long addr;
- unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+ unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+ unsigned long addr = f->start;
- addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
- addr += PAGE_SIZE;
+ addr += 1UL << f->stride_shift;
}
if (local)
- count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
- trace_tlb_flush(reason, nr_pages);
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+ trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
@@ -557,6 +595,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+ return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
@@ -592,8 +635,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+ /*
+ * If no page tables were freed, we can skip sending IPIs to
+ * CPUs in lazy TLB mode. They will flush the CPU themselves
+ * at the next context switch.
+ *
+ * However, if page tables are getting freed, we need to send the
+ * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+ * up on the new contents of what used to be page tables, while
+ * doing a speculative memory access.
+ */
+ if (info->freed_tables)
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
+ else
+ on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+ (void *)info, 1, GFP_ATOMIC, cpumask);
}
/*
@@ -609,12 +667,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long vmflag)
+ unsigned long end, unsigned int stride_shift,
+ bool freed_tables)
{
int cpu;
- struct flush_tlb_info info = {
+ struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
+ .stride_shift = stride_shift,
+ .freed_tables = freed_tables,
};
cpu = get_cpu();
@@ -624,8 +685,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
- !(vmflag & VM_HUGETLB) &&
- ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+ ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
OpenPOWER on IntegriCloud