summaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile16
-rw-r--r--arch/x86/mm/amdtopology.c1
-rw-r--r--arch/x86/mm/dump_pagetables.c93
-rw-r--r--arch/x86/mm/extable.c73
-rw-r--r--arch/x86/mm/fault.c229
-rw-r--r--arch/x86/mm/hugetlbpage.c39
-rw-r--r--arch/x86/mm/ident_map.c13
-rw-r--r--arch/x86/mm/init.c47
-rw-r--r--arch/x86/mm/init_64.c45
-rw-r--r--arch/x86/mm/ioremap.c380
-rw-r--r--arch/x86/mm/kasan_init_64.c251
-rw-r--r--arch/x86/mm/kaslr.c1
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h16
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c658
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c107
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h10
-rw-r--r--arch/x86/mm/kmemcheck/pte.c23
-rw-r--r--arch/x86/mm/kmemcheck/pte.h11
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c71
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h7
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c173
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h19
-rw-r--r--arch/x86/mm/kmmio.c1
-rw-r--r--arch/x86/mm/mem_encrypt.c872
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S149
-rw-r--r--arch/x86/mm/mm_internal.h1
-rw-r--r--arch/x86/mm/mmap.c81
-rw-r--r--arch/x86/mm/mpx.c154
-rw-r--r--arch/x86/mm/numa_64.c1
-rw-r--r--arch/x86/mm/numa_emulation.c56
-rw-r--r--arch/x86/mm/numa_internal.h1
-rw-r--r--arch/x86/mm/pageattr-test.c1
-rw-r--r--arch/x86/mm/pageattr.c77
-rw-r--r--arch/x86/mm/pat.c9
-rw-r--r--arch/x86/mm/pat_internal.h1
-rw-r--r--arch/x86/mm/pat_rbtree.c1
-rw-r--r--arch/x86/mm/pgtable.c18
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/physaddr.c1
-rw-r--r--arch/x86/mm/physaddr.h1
-rw-r--r--arch/x86/mm/pkeys.c3
-rw-r--r--arch/x86/mm/setup_nx.c1
-rw-r--r--arch/x86/mm/srat.c1
-rw-r--r--arch/x86/mm/tlb.c404
46 files changed, 2517 insertions, 1830 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 0fbdcb64f9f8..8e13b8cc6bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,13 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o := n
+# SPDX-License-Identifier: GPL-2.0
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+
+KASAN_SANITIZE_mem_encrypt.o := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
@@ -21,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck/
-
KASAN_SANITIZE_kasan_init_$(BITS).o := n
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
@@ -39,3 +45,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 91f501b2da3b..048c761d97b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* AMD NUMA support.
* Discover the memory map and associated nodes.
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0470826d2bdc..5e3ac6fe6c9e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -13,12 +13,12 @@
*/
#include <linux/debugfs.h>
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
-#include <asm/kasan.h>
#include <asm/pgtable.h>
/*
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
- { "cr3", "pgd", "pud", "pmd", "pte" };
+ { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
if (!pgprot_val(prot)) {
/* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
pt_dump_cont_printf(m, dmsg, " ");
/* Bit 7 has a different meaning on level 3 vs 4 */
- if (level <= 3 && pr & _PAGE_PSE)
+ if (level <= 4 && pr & _PAGE_PSE)
pt_dump_cont_printf(m, dmsg, "PSE ");
else
pt_dump_cont_printf(m, dmsg, " ");
- if ((level == 4 && pr & _PAGE_PAT) ||
- ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+ if ((level == 5 && pr & _PAGE_PAT) ||
+ ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
pt_dump_cont_printf(m, dmsg, "PAT ");
else
pt_dump_cont_printf(m, dmsg, " ");
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
*/
static unsigned long normalize_addr(unsigned long u)
{
-#ifdef CONFIG_X86_64
- return (signed long)(u << 16) >> 16;
-#else
- return u;
-#endif
+ int shift;
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return u;
+
+ shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ return (signed long)(u << shift) >> shift;
}
/*
@@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
- note_page(m, st, __pgprot(prot), 4);
+ note_page(m, st, __pgprot(prot), 5);
start++;
}
}
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+ void *pt)
+{
+ if (__pa(pt) == __pa(kasan_zero_pmd) ||
+#ifdef CONFIG_X86_5LEVEL
+ __pa(pt) == __pa(kasan_zero_p4d) ||
+#endif
+ __pa(pt) == __pa(kasan_zero_pud)) {
+ pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+ note_page(m, st, __pgprot(prot), 5);
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+ void *pt)
+{
+ return false;
+}
+#endif
#if PTRS_PER_PMD > 1
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
- pmd_t *start;
+ pmd_t *start, *pmd_start;
pgprotval_t prot;
- start = (pmd_t *)pud_page_vaddr(addr);
+ pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
if (pmd_large(*start) || !pmd_present(*start)) {
prot = pmd_flags(*start);
- note_page(m, st, __pgprot(prot), 3);
- } else {
+ note_page(m, st, __pgprot(prot), 4);
+ } else if (!kasan_page_table(m, st, pmd_start)) {
walk_pte_level(m, st, *start,
P + i * PMD_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 3);
+ note_page(m, st, __pgprot(0), 4);
start++;
}
}
@@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
#if PTRS_PER_PUD > 1
-/*
- * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
- * KASAN fills page tables with the same values. Since there is no
- * point in checking page table more than once we just skip repeated
- * entries. This saves us dozens of seconds during boot.
- */
-static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
-{
- return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
-}
-
static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
- pud_t *start;
+ pud_t *start, *pud_start;
pgprotval_t prot;
pud_t *prev_pud = NULL;
- start = (pud_t *)p4d_page_vaddr(addr);
+ pud_start = start = (pud_t *)p4d_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
- if (!pud_none(*start) &&
- !pud_already_checked(prev_pud, start, st->check_wx)) {
+ if (!pud_none(*start)) {
if (pud_large(*start) || !pud_present(*start)) {
prot = pud_flags(*start);
- note_page(m, st, __pgprot(prot), 2);
- } else {
+ note_page(m, st, __pgprot(prot), 3);
+ } else if (!kasan_page_table(m, st, pud_start)) {
walk_pmd_level(m, st, *start,
P + i * PUD_LEVEL_MULT);
}
} else
- note_page(m, st, __pgprot(0), 2);
+ note_page(m, st, __pgprot(0), 3);
prev_pud = start;
start++;
@@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
int i;
- p4d_t *start;
+ p4d_t *start, *p4d_start;
pgprotval_t prot;
- start = (p4d_t *)pgd_page_vaddr(addr);
+ p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_P4D; i++) {
st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
if (p4d_large(*start) || !p4d_present(*start)) {
prot = p4d_flags(*start);
note_page(m, st, __pgprot(prot), 2);
- } else {
+ } else if (!kasan_page_table(m, st, p4d_start)) {
walk_pud_level(m, st, *start,
P + i * P4D_LEVEL_MULT);
}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 0ea8afcb929c..3321b446b66c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,6 +2,7 @@
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
+#include <asm/fpu/internal.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
@@ -36,6 +37,76 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fault);
+/*
+ * Handler for UD0 exception following a failed test against the
+ * result of a refcount inc/dec/add/sub.
+ */
+bool ex_handler_refcount(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ /* First unconditionally saturate the refcount. */
+ *(int *)regs->cx = INT_MIN / 2;
+
+ /*
+ * Strictly speaking, this reports the fixup destination, not
+ * the fault location, and not the actually overflowing
+ * instruction, which is the instruction before the "js", but
+ * since that instruction could be a variety of lengths, just
+ * report the location after the overflow, which should be close
+ * enough for finding the overflow, as it's at least back in
+ * the function, having returned from .text.unlikely.
+ */
+ regs->ip = ex_fixup_addr(fixup);
+
+ /*
+ * This function has been called because either a negative refcount
+ * value was seen by any of the refcount functions, or a zero
+ * refcount value was seen by refcount_dec().
+ *
+ * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
+ * wrapped around) will be set. Additionally, seeing the refcount
+ * reach 0 will set ZF (Zero Flag: result was zero). In each of
+ * these cases we want a report, since it's a boundary condition.
+ * The SF case is not reported since it indicates post-boundary
+ * manipulations below zero or above INT_MAX. And if none of the
+ * flags are set, something has gone very wrong, so report it.
+ */
+ if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
+ bool zero = regs->flags & X86_EFLAGS_ZF;
+
+ refcount_error_report(regs, zero ? "hit zero" : "overflow");
+ } else if ((regs->flags & X86_EFLAGS_SF) == 0) {
+ /* Report if none of OF, ZF, nor SF are set. */
+ refcount_error_report(regs, "unexpected saturation");
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_refcount);
+
+/*
+ * Handler for when we fail to restore a task's FPU state. We should never get
+ * here because the FPU state of a task using the FPU (task->thread.fpu.state)
+ * should always be valid. However, past bugs have allowed userspace to set
+ * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
+ * These caused XRSTOR to fail when switching to the task, leaking the FPU
+ * registers of the task previously executing on the CPU. Mitigate this class
+ * of vulnerability by restoring from the initial state (essentially, zeroing
+ * out all the FPU registers) if we can't restore from the task's FPU state.
+ */
+bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ip = ex_fixup_addr(fixup);
+
+ WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
+ (void *)instruction_pointer(regs));
+
+ __copy_kernel_to_fpregs(&init_fpstate, -1);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+
bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
@@ -142,7 +213,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
* undefined. I'm not sure which CPUs do this, but at least
* the 486 DX works this way.
*/
- if ((regs->cs & 0xFFFF) != __KERNEL_CS)
+ if (regs->cs != __KERNEL_CS)
goto fail;
/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..78ca9a8ee454 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
@@ -19,7 +20,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
-#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
@@ -29,26 +29,6 @@
#include <asm/trace/exceptions.h>
/*
- * Page fault error code bits:
- *
- * bit 0 == 0: no page found 1: protection fault
- * bit 1 == 0: read access 1: write access
- * bit 2 == 0: kernel-mode access 1: user-mode access
- * bit 3 == 1: use of reserved bit detected
- * bit 4 == 1: fault was an instruction fetch
- * bit 5 == 1: protection keys block access
- */
-enum x86_pf_error_code {
-
- PF_PROT = 1 << 0,
- PF_WRITE = 1 << 1,
- PF_USER = 1 << 2,
- PF_RSVD = 1 << 3,
- PF_INSTR = 1 << 4,
- PF_PK = 1 << 5,
-};
-
-/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
@@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
@@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* siginfo so userspace can discover which protection key was set
* on the PTE.
*
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
@@ -192,8 +172,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
* faulted on a pte with its pkey=4.
*/
-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
- struct vm_area_struct *vma)
+static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
{
/* This is effectively an #ifdef */
if (!boot_cpu_has(X86_FEATURE_OSPKE))
@@ -205,11 +184,11 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
/*
* force_sig_info_fault() is called from a number of
* contexts, some of which have a VMA and some of which
- * do not. The PF_PK handing happens after we have a
+ * do not. The X86_PF_PK handing happens after we have a
* valid VMA, so we should never reach this without a
* valid VMA.
*/
- if (!vma) {
+ if (!pkey) {
WARN_ONCE(1, "PKU fault with no VMA passed in");
info->si_pkey = 0;
return;
@@ -219,13 +198,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info,
* absolutely guranteed to be 100% accurate because of
* the race explained above.
*/
- info->si_pkey = vma_pkey(vma);
+ info->si_pkey = *pkey;
}
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk, struct vm_area_struct *vma,
- int fault)
+ struct task_struct *tsk, u32 *pkey, int fault)
{
unsigned lsb = 0;
siginfo_t info;
@@ -240,7 +218,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
- fill_sig_info_pkey(si_code, &info, vma);
+ fill_sig_info_pkey(si_code, &info, pkey);
force_sig_info(si_signo, &info, tsk);
}
@@ -396,14 +374,18 @@ static void dump_pagetable(unsigned long address)
pte_t *pte;
#ifdef CONFIG_X86_PAE
- printk("*pdpt = %016Lx ", pgd_val(*pgd));
+ pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
+#define pr_pde pr_cont
+#else
+#define pr_pde pr_info
#endif
p4d = p4d_offset(pgd, address);
pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
- printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+ pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+#undef pr_pde
/*
* We must not directly access the pte in the highpte
@@ -415,9 +397,9 @@ static void dump_pagetable(unsigned long address)
goto out;
pte = pte_offset_kernel(pmd, address);
- printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+ pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
- printk("\n");
+ pr_cont("\n");
}
#else /* CONFIG_X86_64: */
@@ -565,7 +547,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pgd))
goto bad;
- printk("PGD %lx ", pgd_val(*pgd));
+ pr_info("PGD %lx ", pgd_val(*pgd));
if (!pgd_present(*pgd))
goto out;
@@ -574,7 +556,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(p4d))
goto bad;
- printk("P4D %lx ", p4d_val(*p4d));
+ pr_cont("P4D %lx ", p4d_val(*p4d));
if (!p4d_present(*p4d) || p4d_large(*p4d))
goto out;
@@ -582,7 +564,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pud))
goto bad;
- printk("PUD %lx ", pud_val(*pud));
+ pr_cont("PUD %lx ", pud_val(*pud));
if (!pud_present(*pud) || pud_large(*pud))
goto out;
@@ -590,7 +572,7 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pmd))
goto bad;
- printk("PMD %lx ", pmd_val(*pmd));
+ pr_cont("PMD %lx ", pmd_val(*pmd));
if (!pmd_present(*pmd) || pmd_large(*pmd))
goto out;
@@ -598,12 +580,12 @@ static void dump_pagetable(unsigned long address)
if (bad_address(pte))
goto bad;
- printk("PTE %lx", pte_val(*pte));
+ pr_cont("PTE %lx", pte_val(*pte));
out:
- printk("\n");
+ pr_cont("\n");
return;
bad:
- printk("BAD\n");
+ pr_info("BAD\n");
}
#endif /* CONFIG_X86_64 */
@@ -695,7 +677,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (!oops_may_print())
return;
- if (error_code & PF_INSTR) {
+ if (error_code & X86_PF_INSTR) {
unsigned int level;
pgd_t *pgd;
pte_t *pte;
@@ -758,8 +740,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
- /* No context means no VMA to pass down */
- struct vm_area_struct *vma = NULL;
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -779,12 +759,12 @@ no_context(struct pt_regs *regs, unsigned long error_code,
*/
if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.error_code = error_code | X86_PF_USER;
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
force_sig_info_fault(signal, si_code, address,
- tsk, vma, 0);
+ tsk, NULL, 0);
}
/*
@@ -802,7 +782,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
- register void *__sp asm("rsp");
unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -817,7 +796,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
asm volatile ("movq %[stack], %%rsp\n\t"
"call handle_stack_overflow\n\t"
"1: jmp 1b"
- : "+r" (__sp)
+ : ASM_CALL_CONSTRAINT
: "D" ("kernel stack overflow (page fault)"),
"S" (regs), "d" (address),
[stack] "rm" (stack));
@@ -893,13 +872,12 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma,
- int si_code)
+ unsigned long address, u32 *pkey, int si_code)
{
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* It's possible to have interrupts off here:
*/
@@ -920,7 +898,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
- if (unlikely((error_code & PF_INSTR) &&
+ if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
@@ -933,7 +911,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
* are always protection faults.
*/
if (address >= TASK_SIZE_MAX)
- error_code |= PF_PROT;
+ error_code |= X86_PF_PROT;
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
@@ -942,7 +920,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
return;
}
@@ -955,9 +933,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma)
+ unsigned long address, u32 *pkey)
{
- __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
+ __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
}
static void
@@ -965,6 +943,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct vm_area_struct *vma, int si_code)
{
struct mm_struct *mm = current->mm;
+ u32 pkey;
+
+ if (vma)
+ pkey = vma_pkey(vma);
/*
* Something tried to access memory that isn't in our memory map..
@@ -972,7 +954,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
*/
up_read(&mm->mmap_sem);
- __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
+ __bad_area_nosemaphore(regs, error_code, address,
+ (vma) ? &pkey : NULL, si_code);
}
static noinline void
@@ -989,11 +972,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return true;
/* this checks permission keys on the VMA: */
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return true;
return false;
}
@@ -1015,13 +998,13 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- struct vm_area_struct *vma, unsigned int fault)
+ u32 *pkey, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -1042,22 +1025,21 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
code = BUS_MCEERR_AR;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
+ force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma,
- unsigned int fault)
+ unsigned long address, u32 *pkey, unsigned int fault)
{
- if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER)) {
+ if (!(error_code & X86_PF_USER)) {
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
@@ -1072,9 +1054,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, vma, fault);
+ do_sigbus(regs, error_code, address, pkey, fault);
else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address, vma);
+ bad_area_nosemaphore(regs, error_code, address, pkey);
else
BUG();
}
@@ -1082,16 +1064,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
- if ((error_code & PF_WRITE) && !pte_write(*pte))
+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
- if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
- * changes, so no spurious fault will ever set PF_PK.
+ * changes, so no spurious fault will ever set X86_PF_PK.
*/
- if ((error_code & PF_PK))
+ if ((error_code & X86_PF_PK))
return 1;
return 1;
@@ -1137,8 +1119,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
* change, so user accesses are not expected to cause spurious
* faults.
*/
- if (error_code != (PF_WRITE | PF_PROT)
- && error_code != (PF_INSTR | PF_PROT))
+ if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+ error_code != (X86_PF_INSTR | X86_PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);
@@ -1198,19 +1180,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
- if (error_code & PF_PK)
+ if (error_code & X86_PF_PK)
return 1;
/*
* Make sure to check the VMA so that we do not perform
- * faults just to hit a PF_PK as soon as we fill in a
+ * faults just to hit a X86_PF_PK as soon as we fill in a
* page.
*/
- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
- (error_code & PF_INSTR), foreign))
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
return 1;
- if (error_code & PF_WRITE) {
+ if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
@@ -1218,7 +1200,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
}
/* read, present: */
- if (unlikely(error_code & PF_PROT))
+ if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
@@ -1241,7 +1223,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
if (!static_cpu_has(X86_FEATURE_SMAP))
return false;
- if (error_code & PF_USER)
+ if (error_code & X86_PF_USER)
return false;
if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1254,10 +1236,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
- *
- * This function must have noinline because both callers
- * {,trace_}do_page_fault() have notrace on. Having this an actual function
- * guarantees there's a function trace entry.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
@@ -1268,6 +1246,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct mm_struct *mm;
int fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ u32 pkey;
tsk = current;
mm = tsk->mm;
@@ -1276,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
- if (kmemcheck_active(regs))
- kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
@@ -1297,12 +1274,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
-
- if (kmemcheck_fault(regs, address, error_code))
- return;
}
/* Can handle a stale RO->RW TLB: */
@@ -1325,7 +1299,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & PF_RSVD))
+ if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
@@ -1351,7 +1325,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
- error_code |= PF_USER;
+ error_code |= X86_PF_USER;
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1360,9 +1334,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
/*
@@ -1382,7 +1356,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if ((error_code & PF_USER) == 0 &&
+ if (!(error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
@@ -1409,7 +1383,7 @@ retry:
bad_area(regs, error_code, address);
return;
}
- if (error_code & PF_USER) {
+ if (error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
@@ -1441,7 +1415,17 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ *
+ * Note that handle_userfault() may also release and reacquire mmap_sem
+ * (and not return with VM_FAULT_RETRY), when returning to userland to
+ * repeat the page fault later with a VM_FAULT_NOPAGE retval
+ * (potentially after handling any pending signal during the return to
+ * userland). The return to userland is identified whenever
+ * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
+ * Thus we have to be careful about not touching vma after handling the
+ * fault, so we read the pkey beforehand.
*/
+ pkey = vma_pkey(vma);
fault = handle_mm_fault(vma, address, flags);
major |= fault & VM_FAULT_MAJOR;
@@ -1470,7 +1454,7 @@ good_area:
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, vma, fault);
+ mm_fault_error(regs, error_code, address, &pkey, fault);
return;
}
@@ -1490,27 +1474,6 @@ good_area:
}
NOKPROBE_SYMBOL(__do_page_fault);
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
-{
- unsigned long address = read_cr2(); /* Get the faulting address */
- enum ctx_state prev_state;
-
- /*
- * We must have this function tagged with __kprobes, notrace and call
- * read_cr2() before calling anything else. To avoid calling any kind
- * of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contain all sorts of tracepoints.
- */
-
- prev_state = exception_enter();
- __do_page_fault(regs, error_code, address);
- exception_exit(prev_state);
-}
-NOKPROBE_SYMBOL(do_page_fault);
-
-#ifdef CONFIG_TRACING
static nokprobe_inline void
trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
@@ -1521,22 +1484,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
trace_page_fault_kernel(address, regs, error_code);
}
+/*
+ * We must have this function blacklisted from kprobes, tagged with notrace
+ * and call read_cr2() before calling anything else. To avoid calling any
+ * kind of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contains all sorts of tracepoints.
+ */
dotraplinkage void notrace
-trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
- /*
- * The exception_enter and tracepoint processing could
- * trigger another page faults (user space callchain
- * reading) and destroy the original cr2 value, so read
- * the faulting address now.
- */
- unsigned long address = read_cr2();
+ unsigned long address = read_cr2(); /* Get the faulting address */
enum ctx_state prev_state;
prev_state = exception_enter();
- trace_page_fault_entries(address, regs, error_code);
+ if (trace_pagefault_enabled())
+ trace_page_fault_entries(address, regs, error_code);
+
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
-NOKPROBE_SYMBOL(trace_do_page_fault);
-#endif /* CONFIG_TRACING */
+NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 2824607df108..00b296617ca4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* IA-32 Huge TLB Page Support for Kernel.
*
@@ -18,6 +19,7 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/elf.h>
+#include <asm/mpx.h>
#if 0 /* This is just for testing */
struct page *
@@ -85,25 +87,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
info.flags = 0;
info.length = len;
info.low_limit = get_mmap_base(1);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ */
info.high_limit = in_compat_syscall() ?
- tasksize_32bit() : tasksize_64bit();
+ task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
- unsigned long addr0, unsigned long len,
+ unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
- unsigned long addr;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -118,7 +133,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
+ info.high_limit = TASK_SIZE_LOW;
addr = vm_unmapped_area(&info);
}
@@ -135,9 +150,15 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (len & ~huge_page_mask(h))
return -EINVAL;
+
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
if (len > TASK_SIZE)
return -ENOMEM;
+ /* No address checking. See comment at mmap_address_hint_valid() */
if (flags & MAP_FIXED) {
if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
@@ -145,12 +166,16 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
+ addr &= huge_page_mask(h);
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
+ if (!vma || addr + len <= vm_start_gap(vma))
return addr;
}
+
+get_unmapped_area:
if (mm->get_unmapped_area == arch_get_unmapped_area)
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index adab1595f4bd..ab33a32df2a8 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Helper routines for building identity mapping page tables. This is
* included by both the compressed kernel and the regular kernel.
@@ -51,7 +52,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
if (!pmd)
return -ENOMEM;
ident_pmd_init(info, pmd, addr, next);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
}
return 0;
@@ -79,7 +80,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
if (!pud)
return -ENOMEM;
ident_pud_init(info, pud, addr, next);
- set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
}
return 0;
@@ -93,6 +94,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long next;
int result;
+ /* Set the default pagetable flags if not supplied */
+ if (!info->kernpg_flag)
+ info->kernpg_flag = _KERNPG_TABLE;
+
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
p4d_t *p4d;
@@ -116,14 +121,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
if (result)
return result;
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
* With p4d folded, pgd is equal to p4d.
* The pgd entry has to point to the pud page table in this case.
*/
pud_t *pud = pud_offset(p4d, 0);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
}
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 673541eb3b3f..6fdf91ef130a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,8 @@
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include <asm/kaslr.h>
+#include <asm/hypervisor.h>
+#include <asm/cpufeature.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -90,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned int order;
order = get_order((unsigned long)num << PAGE_SHIFT);
- return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
- __GFP_ZERO, order);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@ -162,12 +163,11 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
/*
- * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
- * use small pages.
+ * For pagealloc debugging, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
else
direct_gbpages = 0;
@@ -192,6 +192,38 @@ static void __init probe_page_size_mask(void)
}
}
+static void setup_pcid(void)
+{
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_PCID)) {
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() --
+ * the trampoline code can't handle CR4.PCIDE and
+ * it wouldn't do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually
+ * cause the bits in question to remain set all the
+ * way through the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE
+ * manually in start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ }
+ }
+#endif
+}
+
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -591,6 +623,7 @@ void __init init_mem_mapping(void)
unsigned long end;
probe_page_size_mask();
+ setup_pcid();
#ifdef CONFIG_X86_64
end = max_pfn << PAGE_SHIFT;
@@ -636,6 +669,8 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
+ x86_init.hyper.init_mem_mapping();
+
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
@@ -812,7 +847,7 @@ void __init zone_sizes_init(void)
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
- .state = 0,
+ .next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..4a837289f2ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
void *ptr;
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -761,7 +761,7 @@ void __init paging_init(void)
* After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
* updating.
*/
-static void update_end_of_memory_vars(u64 start, u64 size)
+static void update_end_of_memory_vars(u64 start, u64 size)
{
unsigned long end_pfn = PFN_UP(start + size);
@@ -772,22 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
}
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size);
-
ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start, size);
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
return ret;
}
+
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ init_memory_mapping(start, start + size);
+
+ return add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
EXPORT_SYMBOL_GPL(arch_add_memory);
#define PAGE_INUSE 0xFD
@@ -1165,12 +1173,18 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
+ /*
+ * Must be done after boot memory is put on freelist, because here we
+ * might set fields in deferred struct pages that have not yet been
+ * initialized, and free_all_bootmem() initializes all the reserved
+ * deferred pages for us.
+ */
+ register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
PAGE_SIZE, KCORE_OTHER);
@@ -1391,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- pr_warn_once("vmemmap: falling back to regular page backing\n");
if (vmemmap_populate_basepages(addr, next, node))
return -ENOMEM;
}
@@ -1418,16 +1431,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
void register_page_bootmem_memmap(unsigned long section_nr,
- struct page *start_page, unsigned long size)
+ struct page *start_page, unsigned long nr_pages)
{
unsigned long addr = (unsigned long)start_page;
- unsigned long end = (unsigned long)(start_page + size);
+ unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- unsigned int nr_pages;
+ unsigned int nr_pmd_pages;
struct page *page;
for (; addr < end; addr = next) {
@@ -1474,9 +1487,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
if (pmd_none(*pmd))
continue;
- nr_pages = 1 << (get_order(PMD_SIZE));
+ nr_pmd_pages = 1 << get_order(PMD_SIZE);
page = pmd_page(*pmd);
- while (nr_pages--)
+ while (nr_pmd_pages--)
get_page_bootmem(section_nr, page++,
SECTION_INFO);
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4c1b5fd0c7ad..6e4573b1da34 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,8 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -21,9 +23,15 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/pat.h>
+#include <asm/setup.h>
#include "physaddr.h"
+struct ioremap_mem_flags {
+ bool system_ram;
+ bool desc_other;
+};
+
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
@@ -53,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
-static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static bool __ioremap_check_ram(struct resource *res)
{
+ unsigned long start_pfn, stop_pfn;
unsigned long i;
- for (i = 0; i < nr_pages; ++i)
- if (pfn_valid(start_pfn + i) &&
- !PageReserved(pfn_to_page(start_pfn + i)))
- return 1;
+ if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+ return false;
- return 0;
+ start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+ if (stop_pfn > start_pfn) {
+ for (i = 0; i < (stop_pfn - start_pfn); ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return true;
+ }
+
+ return false;
+}
+
+static int __ioremap_check_desc_other(struct resource *res)
+{
+ return (res->desc != IORES_DESC_NONE);
+}
+
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+ struct ioremap_mem_flags *flags = arg;
+
+ if (!flags->system_ram)
+ flags->system_ram = __ioremap_check_ram(res);
+
+ if (!flags->desc_other)
+ flags->desc_other = __ioremap_check_desc_other(res);
+
+ return flags->system_ram && flags->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks resources marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a
+ * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+ struct ioremap_mem_flags *flags)
+{
+ u64 start, end;
+
+ start = (u64)addr;
+ end = start + size - 1;
+ memset(flags, 0, sizeof(*flags));
+
+ walk_mem_res(start, end, flags, __ioremap_res_check);
}
/*
@@ -84,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller)
{
unsigned long offset, vaddr;
- resource_size_t pfn, last_pfn, last_addr;
+ resource_size_t last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
+ struct ioremap_mem_flags mem_flags;
struct vm_struct *area;
enum page_cache_mode new_pcm;
pgprot_t prot;
@@ -105,19 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return NULL;
}
- /*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
- if (is_ISA_range(phys_addr, last_addr))
- return (__force void __iomem *)phys_to_virt(phys_addr);
+ __ioremap_check_mem(phys_addr, size, &mem_flags);
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1) {
+ if (mem_flags.system_ram) {
WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
&phys_addr, &last_addr);
return NULL;
@@ -149,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
pcm = new_pcm;
}
+ /*
+ * If the page being mapped is in memory and SEV is active then
+ * make sure the memory encryption attribute is enabled in the
+ * resulting mapping.
+ */
prot = PAGE_KERNEL_IO;
+ if (sev_active() && mem_flags.desc_other)
+ prot = pgprot_encrypted(prot);
+
switch (pcm) {
case _PAGE_CACHE_MODE_UC:
default:
@@ -340,13 +392,17 @@ void iounmap(volatile void __iomem *addr)
return;
/*
- * __ioremap special-cases the PCI/ISA range by not instantiating a
- * vm_area and by simply returning an address into the kernel mapping
- * of ISA space. So handle that here.
+ * The PCI/ISA range special-casing was removed from __ioremap()
+ * so this check, in theory, can be removed. However, there are
+ * cases where iounmap() is called for addresses not obtained via
+ * ioremap() (vga16fb for example). Add a warning so that these
+ * cases can be caught and fixed.
*/
if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
- (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+ (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+ WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
return;
+ }
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
@@ -399,12 +455,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
unsigned long offset = phys & ~PAGE_MASK;
void *vaddr;
- /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
- if (page_is_ram(start >> PAGE_SHIFT))
- return __va(phys);
+ /* memremap() maps if RAM, otherwise falls back to ioremap() */
+ vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
- vaddr = ioremap_cache(start, PAGE_SIZE);
- /* Only add the offset on success and return NULL if the ioremap() failed: */
+ /* Only add the offset on success and return NULL if memremap() failed */
if (vaddr)
vaddr += offset;
@@ -413,11 +467,279 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
{
- if (page_is_ram(phys >> PAGE_SHIFT))
- return;
+ memunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted. If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
+ *
+ * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
+ * only persistent memory should be mapped decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ int is_pmem;
+
+ /*
+ * Check if the address is part of a persistent memory region.
+ * This check covers areas added by E820, EFI and ACPI.
+ */
+ is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+ IORES_DESC_PERSISTENT_MEMORY);
+ if (is_pmem != REGION_DISJOINT)
+ return true;
+
+ /*
+ * Check if the non-volatile attribute is set for an EFI
+ * reserved area.
+ */
+ if (efi_enabled(EFI_BOOT)) {
+ switch (efi_mem_type(phys_addr)) {
+ case EFI_RESERVED_TYPE:
+ if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+ return true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* Check if the address is outside kernel usable area */
+ switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_ACPI:
+ case E820_TYPE_NVS:
+ case E820_TYPE_UNUSABLE:
+ /* For SEV, these areas are encrypted */
+ if (sev_active())
+ break;
+ /* Fallthrough */
+
+ case E820_TYPE_PRAM:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ u64 paddr;
+
+ /* Check if the address is part of EFI boot/runtime data */
+ if (!efi_enabled(EFI_BOOT))
+ return false;
+
+ paddr = boot_params.efi_info.efi_memmap_hi;
+ paddr <<= 32;
+ paddr |= boot_params.efi_info.efi_memmap;
+ if (phys_addr == paddr)
+ return true;
+
+ paddr = boot_params.efi_info.efi_systab_hi;
+ paddr <<= 32;
+ paddr |= boot_params.efi_info.efi_systab;
+ if (phys_addr == paddr)
+ return true;
+
+ if (efi_is_table_address(phys_addr))
+ return true;
+
+ switch (efi_mem_type(phys_addr)) {
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_RUNTIME_SERVICES_DATA:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ struct setup_data *data;
+ u64 paddr, paddr_next;
+
+ paddr = boot_params.hdr.setup_data;
+ while (paddr) {
+ unsigned int len;
+
+ if (phys_addr == paddr)
+ return true;
+
+ data = memremap(paddr, sizeof(*data),
+ MEMREMAP_WB | MEMREMAP_DEC);
+
+ paddr_next = data->next;
+ len = data->len;
+
+ memunmap(data);
+
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+ return true;
+
+ paddr = paddr_next;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ struct setup_data *data;
+ u64 paddr, paddr_next;
+
+ paddr = boot_params.hdr.setup_data;
+ while (paddr) {
+ unsigned int len;
+
+ if (phys_addr == paddr)
+ return true;
+
+ data = early_memremap_decrypted(paddr, sizeof(*data));
+
+ paddr_next = data->next;
+ len = data->len;
+
+ early_memunmap(data, sizeof(*data));
+
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+ return true;
+
+ paddr = paddr_next;
+ }
+
+ return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags)
+{
+ if (!mem_encrypt_active())
+ return true;
+
+ if (flags & MEMREMAP_ENC)
+ return true;
+
+ if (flags & MEMREMAP_DEC)
+ return false;
+
+ if (sme_active()) {
+ if (memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ return false;
+ }
+
+ return !memremap_should_map_decrypted(phys_addr, size);
+}
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size,
+ pgprot_t prot)
+{
+ bool encrypted_prot;
+
+ if (!mem_encrypt_active())
+ return prot;
+
+ encrypted_prot = true;
+
+ if (sme_active()) {
+ if (early_memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ encrypted_prot = false;
+ }
+
+ if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
+ encrypted_prot = false;
+
+ return encrypted_prot ? pgprot_encrypted(prot)
+ : pgprot_decrypted(prot);
+}
+
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+ return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory with encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+ unsigned long size)
+{
+ /* Be sure the write-protect PAT entry is set for write-protect */
+ if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ return NULL;
+
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory without encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+ unsigned long size)
+{
+ /* Be sure the write-protect PAT entry is set for write-protect */
+ if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ return NULL;
- iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
+#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..99dfed6dfef8 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,21 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/vmalloc.h>
#include <asm/e820/types.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/pgtable.h>
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_MAX_ENTRIES];
-static int __init map_range(struct range *range)
+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
+static __init void *early_alloc(size_t size, int nid)
+{
+ return memblock_virt_alloc_try_nid_nopanic(size, size,
+ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pte_t *pte;
+
+ if (pmd_none(*pmd)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_PSE) &&
+ ((end - addr) == PMD_SIZE) &&
+ IS_ALIGNED(addr, PMD_SIZE)) {
+ p = early_alloc(PMD_SIZE, nid);
+ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PMD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid);
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ pte_t entry;
+ void *p;
+
+ if (!pte_none(*pte))
+ continue;
+
+ p = early_alloc(PAGE_SIZE, nid);
+ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ if (pud_none(*pud)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
+ ((end - addr) == PUD_SIZE) &&
+ IS_ALIGNED(addr, PUD_SIZE)) {
+ p = early_alloc(PUD_SIZE, nid);
+ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PUD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid);
+ pud_populate(&init_mm, pud, p);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (!pmd_large(*pmd))
+ kasan_populate_pmd(pmd, addr, next, nid);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ if (p4d_none(*p4d)) {
+ void *p = early_alloc(PAGE_SIZE, nid);
+
+ p4d_populate(&init_mm, p4d, p);
+ }
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (!pud_large(*pud))
+ kasan_populate_pud(pud, addr, next, nid);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ void *p;
+ p4d_t *p4d;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ p = early_alloc(PAGE_SIZE, nid);
+ pgd_populate(&init_mm, pgd, p);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ kasan_populate_p4d(p4d, addr, next, nid);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
+ int nid)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ addr = addr & PAGE_MASK;
+ end = round_up(end, PAGE_SIZE);
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_populate_pgd(pgd, addr, next, nid);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void __init map_range(struct range *range)
{
unsigned long start;
unsigned long end;
@@ -23,15 +155,17 @@ static int __init map_range(struct range *range)
start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
- return vmemmap_populate(start, end, NUMA_NO_NODE);
+ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
}
static void __init clear_pgds(unsigned long start,
unsigned long end)
{
pgd_t *pgd;
+ /* See comment in kasan_init() */
+ unsigned long pgd_end = end & PGDIR_MASK;
- for (; start < end; start += PGDIR_SIZE) {
+ for (; start < pgd_end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -42,29 +176,61 @@ static void __init clear_pgds(unsigned long start,
else
pgd_clear(pgd);
}
+
+ pgd = pgd_offset_k(start);
+ for (; start < end; start += P4D_SIZE)
+ p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+ unsigned long p4d;
+
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return (p4d_t *)pgd;
+
+ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
+ p4d += __START_KERNEL_map - phys_base;
+ return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ pgd_t pgd_entry;
+ p4d_t *p4d, p4d_entry;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+ set_pgd(pgd, pgd_entry);
+ }
+
+ p4d = early_p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_none(*p4d))
+ continue;
+
+ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+ set_p4d(p4d, p4d_entry);
+ } while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
{
- int i;
- unsigned long start = KASAN_SHADOW_START;
+ /* See comment in kasan_init() */
+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
- for (i = pgd_index(start); start < end; i++) {
- switch (CONFIG_PGTABLE_LEVELS) {
- case 4:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
- _KERNPG_TABLE);
- break;
- case 5:
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
- _KERNPG_TABLE);
- break;
- default:
- BUILD_BUG();
- }
- start += PGDIR_SIZE;
- }
+ pgd += pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
}
#ifdef CONFIG_KASAN_INLINE
@@ -87,7 +253,7 @@ static struct notifier_block kasan_die_notifier = {
void __init kasan_early_init(void)
{
int i;
- pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -101,7 +267,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -117,28 +283,51 @@ void __init kasan_init(void)
#endif
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+ /*
+ * We use the same shadow offset for 4- and 5-level paging to
+ * facilitate boot-time switching between paging modes.
+ * As result in 5-level paging mode KASAN_SHADOW_START and
+ * KASAN_SHADOW_END are not aligned to PGD boundary.
+ *
+ * KASAN_SHADOW_START doesn't share PGD with anything else.
+ * We claim whole PGD entry to make things easier.
+ *
+ * KASAN_SHADOW_END lands in the last PGD entry and it collides with
+ * bunch of things like kernel code, modules, EFI mapping, etc.
+ * We need to take extra steps to not overwrite them.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ void *ptr;
+
+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+ }
+
load_cr3(early_top_pgt);
__flush_tlb_all();
- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_MAX_ENTRIES; i++) {
if (pfn_mapped[i].end == 0)
break;
- if (map_range(&pfn_mapped[i]))
- panic("kasan: unable to allocate shadow!");
+ map_range(&pfn_mapped[i]);
}
+
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
kasan_mem_to_shadow((void *)__START_KERNEL_map));
- vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
- (unsigned long)kasan_mem_to_shadow(_end),
- NUMA_NO_NODE);
+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ early_pfn_to_nid(__pa(_stext)));
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
@@ -153,7 +342,7 @@ void __init kasan_init(void)
*/
memset(kasan_zero_page, 0, PAGE_SIZE);
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+ pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
set_pte(&kasan_zero_pte[i], pte);
}
/* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index af599167fe3c..879ef930e2c2 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file implements KASLR memory randomization for x86_64. It randomizes
* the virtual address space of kernel memory regions (physical memory
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index dab41876cdd5..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,227 +1 @@
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
- KMEMCHECK_ERROR_INVALID_ACCESS,
- KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
- enum kmemcheck_error_type type;
-
- union {
- /* KMEMCHECK_ERROR_INVALID_ACCESS */
- struct {
- /* Kind of access that caused the error */
- enum kmemcheck_shadow state;
- /* Address and size of the erroneous read */
- unsigned long address;
- unsigned int size;
- };
- };
-
- struct pt_regs regs;
- struct stack_trace trace;
- unsigned long trace_entries[32];
-
- /* We compress it to a char. */
- unsigned char shadow_copy[SHADOW_COPY_SIZE];
- unsigned char memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == ARRAY_SIZE(error_fifo)) {
- ++error_missed_count;
- return NULL;
- }
-
- e = &error_fifo[error_wr];
- if (++error_wr == ARRAY_SIZE(error_fifo))
- error_wr = 0;
- ++error_count;
- return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == 0)
- return NULL;
-
- e = &error_fifo[error_rd];
- if (++error_rd == ARRAY_SIZE(error_fifo))
- error_rd = 0;
- --error_count;
- return e;
-}
-
-void kmemcheck_error_recall(void)
-{
- static const char *desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
- [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
- [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
- [KMEMCHECK_SHADOW_FREED] = "freed",
- };
-
- static const char short_desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
- [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
- [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
- [KMEMCHECK_SHADOW_FREED] = 'f',
- };
-
- struct kmemcheck_error *e;
- unsigned int i;
-
- e = error_next_rd();
- if (!e)
- return;
-
- switch (e->type) {
- case KMEMCHECK_ERROR_INVALID_ACCESS:
- printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
- 8 * e->size, e->state < ARRAY_SIZE(desc) ?
- desc[e->state] : "(invalid shadow state)",
- (void *) e->address);
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i)
- printk(KERN_CONT "%02x", e->memory_copy[i]);
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
- if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
- printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
- else
- printk(KERN_CONT " ?");
- }
- printk(KERN_CONT "\n");
- printk(KERN_WARNING "%*c\n", 2 + 2
- * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
- break;
- case KMEMCHECK_ERROR_BUG:
- printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
- break;
- }
-
- __show_regs(&e->regs, 1);
- print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
- while (error_count > 0)
- kmemcheck_error_recall();
-
- if (error_missed_count > 0) {
- printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
- "the queue was too small\n", error_missed_count);
- error_missed_count = 0;
- }
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs)
-{
- static unsigned long prev_ip;
-
- struct kmemcheck_error *e;
- void *shadow_copy;
- void *memory_copy;
-
- /* Don't report several adjacent errors from the same EIP. */
- if (regs->ip == prev_ip)
- return;
- prev_ip = regs->ip;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
- e->state = state;
- e->address = address;
- e->size = size;
-
- /* Save regs */
- memcpy(&e->regs, regs, sizeof(*regs));
-
- /* Save stack trace */
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 0;
- save_stack_trace_regs(regs, &e->trace);
-
- /* Round address down to nearest 16 bytes */
- shadow_copy = kmemcheck_shadow_lookup(address
- & ~(SHADOW_COPY_SIZE - 1));
- BUG_ON(!shadow_copy);
-
- memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
- kmemcheck_show_addr(address);
- memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
- memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
- kmemcheck_hide_addr(address);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
- struct kmemcheck_error *e;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_BUG;
-
- memcpy(&e->regs, regs, sizeof(*regs));
-
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 1;
- save_stack_trace(&e->trace);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
+// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 0efc2e8d0a20..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,15 +1 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
+/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-# define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * Limit SMP to use a single CPU. We rely on the fact that this code
- * runs before SMP is set up.
- */
- if (setup_max_cpus > 1) {
- printk(KERN_INFO
- "kmemcheck: Limiting number of CPUs to 1.\n");
- setup_max_cpus = 1;
- }
-#endif
-
- if (!kmemcheck_selftest()) {
- printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
- kmemcheck_enabled = 0;
- return -EINVAL;
- }
-
- printk(KERN_INFO "kmemcheck: Initialized\n");
- return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
- int val;
- int ret;
-
- if (!str)
- return -EINVAL;
-
- ret = kstrtoint(str, 0, &val);
- if (ret)
- return ret;
- kmemcheck_enabled = val;
- return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-struct kmemcheck_context {
- bool busy;
- int balance;
-
- /*
- * There can be at most two memory operands to an instruction, but
- * each address can cross a page boundary -- so we may need up to
- * four addresses that must be hidden/revealed for each fault.
- */
- unsigned long addr[4];
- unsigned long n_addrs;
- unsigned long flags;
-
- /* Data size of the instruction that caused a fault. */
- unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
- data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_show_addr(data->addr[i]);
-
- return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_hide_addr(data->addr[i]);
-
- return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 0)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->balance = 0;
- return;
- }
-
- /*
- * None of the addresses actually belonged to kmemcheck. Note that
- * this is not an error.
- */
- if (kmemcheck_show_all() == 0)
- return;
-
- ++data->balance;
-
- /*
- * The IF needs to be cleared as well, so that the faulting
- * instruction can run "uninterrupted". Otherwise, we might take
- * an interrupt and start executing that before we've had a chance
- * to hide the page again.
- *
- * NOTE: In the rare case of multiple faults, we must not override
- * the original flags:
- */
- if (!(regs->flags & X86_EFLAGS_TF))
- data->flags = regs->flags;
-
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- int n;
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 1)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->n_addrs = 0;
- data->balance = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
- return;
- }
-
- if (kmemcheck_enabled)
- n = kmemcheck_hide_all();
- else
- n = kmemcheck_show_all();
-
- if (n == 0)
- return;
-
- --data->balance;
-
- data->n_addrs = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
- /* This will also check the "hidden" flag of the PTE. */
- return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
- enum kmemcheck_shadow status;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-
- /* Don't warn about it again. */
- kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
- enum kmemcheck_shadow status;
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return true;
-
- status = kmemcheck_shadow_test_all(shadow, size);
-
- return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_read_strict(regs, addr, size);
- return;
- }
-
- /*
- * What we do is basically to split the access across the
- * two pages and handle each part separately. Yes, this means
- * that we may now see reads that are 3 + 5 bytes, for
- * example (and if both are uninitialized, there will be two
- * reports), but it makes the code a lot simpler.
- */
- kmemcheck_read_strict(regs, addr, next_page - addr);
- kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_write_strict(regs, addr, size);
- return;
- }
-
- /* See comment in kmemcheck_read(). */
- kmemcheck_write_strict(regs, addr, next_page - addr);
- kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
- unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
- uint8_t shadow[8];
- enum kmemcheck_shadow status;
-
- unsigned long page;
- unsigned long next_addr;
- unsigned long next_page;
-
- uint8_t *x;
- unsigned int i;
- unsigned int n;
-
- BUG_ON(size > sizeof(shadow));
-
- page = src_addr & PAGE_MASK;
- next_addr = src_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < size; ++i)
- shadow[i] = x[i];
- } else {
- for (i = 0; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- } else {
- n = next_page - src_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < n; ++i)
- shadow[i] = x[i];
- } else {
- /* Not tracked */
- for (i = 0; i < n; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i)
- shadow[i] = x[i - n];
- } else {
- /* Not tracked */
- for (i = n; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- page = dst_addr & PAGE_MASK;
- next_addr = dst_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < size; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- } else {
- n = next_page - dst_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < n; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i) {
- x[i - n] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- }
-
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, src_addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
- KMEMCHECK_READ,
- KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
- unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
- const uint8_t *insn;
- const uint8_t *insn_primary;
- unsigned int size;
-
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- /* Recursive fault -- ouch. */
- if (data->busy) {
- kmemcheck_show_addr(fallback_address);
- kmemcheck_error_save_bug(regs);
- return;
- }
-
- data->busy = true;
-
- insn = (const uint8_t *) regs->ip;
- insn_primary = kmemcheck_opcode_get_primary(insn);
-
- kmemcheck_opcode_decode(insn, &size);
-
- switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
- /* AND, OR, XOR */
- /*
- * Unfortunately, these instructions have to be excluded from
- * our regular checking since they access only some (and not
- * all) bits. This clears out "bogus" bitfield-access warnings.
- */
- case 0x80:
- case 0x81:
- case 0x82:
- case 0x83:
- switch ((insn_primary[1] >> 3) & 7) {
- /* OR */
- case 1:
- /* AND */
- case 4:
- /* XOR */
- case 6:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
-
- /* ADD */
- case 0:
- /* ADC */
- case 2:
- /* SBB */
- case 3:
- /* SUB */
- case 5:
- /* CMP */
- case 7:
- break;
- }
- break;
-#endif
-
- /* MOVS, MOVSB, MOVSW, MOVSD */
- case 0xa4:
- case 0xa5:
- /*
- * These instructions are special because they take two
- * addresses, but we only get one page fault.
- */
- kmemcheck_copy(regs, regs->si, regs->di, size);
- goto out;
-
- /* CMPS, CMPSB, CMPSW, CMPSD */
- case 0xa6:
- case 0xa7:
- kmemcheck_read(regs, regs->si, size);
- kmemcheck_read(regs, regs->di, size);
- goto out;
- }
-
- /*
- * If the opcode isn't special in any way, we use the data from the
- * page fault handler to determine the address and type of memory
- * access.
- */
- switch (fallback_method) {
- case KMEMCHECK_READ:
- kmemcheck_read(regs, fallback_address, size);
- goto out;
- case KMEMCHECK_WRITE:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
- }
-
-out:
- data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- pte_t *pte;
-
- /*
- * XXX: Is it safe to assume that memory accesses from virtual 86
- * mode or non-kernel code segments will _never_ access kernel
- * memory (e.g. tracked pages)? For now, we need this to avoid
- * invoking kmemcheck for PnP BIOS calls.
- */
- if (regs->flags & X86_VM_MASK)
- return false;
- if (regs->cs != __KERNEL_CS)
- return false;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return false;
-
- WARN_ON_ONCE(in_nmi());
-
- if (error_code & 2)
- kmemcheck_access(regs, address, KMEMCHECK_WRITE);
- else
- kmemcheck_access(regs, address, KMEMCHECK_READ);
-
- kmemcheck_show(regs);
- return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
- if (!kmemcheck_active(regs))
- return false;
-
- /* We're done. */
- kmemcheck_hide(regs);
- return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 324aa3f07237..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,106 +1 @@
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
- return
- /* Group 1 */
- b == 0xf0 || b == 0xf2 || b == 0xf3
- /* Group 2 */
- || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
- || b == 0x64 || b == 0x65
- /* Group 3 */
- || b == 0x66
- /* Group 4 */
- || b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
- /* Default operand size */
- int operand_size_override = 4;
-
- /* prefixes */
- for (; opcode_is_prefix(*op); ++op) {
- if (*op == 0x66)
- operand_size_override = 2;
- }
-
- /* REX prefix */
- if (opcode_is_rex_prefix(*op)) {
- uint8_t rex = *op;
-
- ++op;
- if (rex & REX_W) {
- switch (*op) {
- case 0x63:
- *size = 4;
- return;
- case 0x0f:
- ++op;
-
- switch (*op) {
- case 0xb6:
- case 0xbe:
- *size = 1;
- return;
- case 0xb7:
- case 0xbf:
- *size = 2;
- return;
- }
-
- break;
- }
-
- *size = 8;
- return;
- }
- }
-
- /* escape opcode */
- if (*op == 0x0f) {
- ++op;
-
- /*
- * This is move with zero-extend and sign-extend, respectively;
- * we don't have to think about 0xb6/0xbe, because this is
- * already handled in the conditional below.
- */
- if (*op == 0xb7 || *op == 0xbf)
- operand_size_override = 2;
- }
-
- *size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
- /* skip prefixes */
- while (opcode_is_prefix(*op))
- ++op;
- if (opcode_is_rex_prefix(*op))
- ++op;
- return op;
-}
+// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 6956aad66b5b..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,9 +1 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
+/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 4ead26eeaf96..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,22 +1 @@
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
- pte_t *pte;
- unsigned int level;
-
- pte = lookup_address(address, &level);
- if (!pte)
- return NULL;
- if (level != PG_LEVEL_4K)
- return NULL;
- if (!pte_hidden(*pte))
- return NULL;
-
- return pte;
-}
-
+// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index 9f5966456492..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,10 +1 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
+/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index aef7140c0063..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,70 +1 @@
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
- unsigned int expected_size;
- const uint8_t *insn;
- const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
- /* REP MOVS */
- {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
- {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
-
- /* MOVZX / MOVZXD */
- {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
- {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
- {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
- /* MOVZX / MOVZXD */
- {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
- {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
- unsigned size;
-
- kmemcheck_opcode_decode(op->insn, &size);
-
- if (size == op->expected_size)
- return true;
-
- printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
- op->desc, op->expected_size, size);
- return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
- bool pass = true;
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
- pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
- return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
- bool pass = true;
-
- pass = pass && selftest_opcodes_all();
-
- return pass;
-}
+// SPDX-License-Identifier: GPL-2.0
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8fed4fe11f95..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,6 +1 @@
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
+/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
- pte_t *pte;
- struct page *page;
-
- if (!virt_addr_valid(address))
- return NULL;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return NULL;
-
- page = virt_to_page(address);
- if (!page->shadow)
- return NULL;
- return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
- enum kmemcheck_shadow status)
-{
- unsigned long addr = (unsigned long) address;
- unsigned long last_addr = addr + n - 1;
- unsigned long page = addr & PAGE_MASK;
- unsigned long last_page = last_addr & PAGE_MASK;
- unsigned int first_n;
- void *shadow;
-
- /* If the memory range crosses a page boundary, stop there. */
- if (page == last_page)
- first_n = n;
- else
- first_n = page + PAGE_SIZE - addr;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, first_n);
-
- addr += first_n;
- n -= first_n;
-
- /* Do full-page memset()s. */
- while (n >= PAGE_SIZE) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, PAGE_SIZE);
-
- addr += PAGE_SIZE;
- n -= PAGE_SIZE;
- }
-
- /* Do the remaining page, if any. */
- if (n > 0) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, n);
- }
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /*
- * Make sure _some_ bytes are initialized. Gcc frequently generates
- * code to access neighboring bytes.
- */
- for (i = 0; i < size; ++i) {
- if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-#else
- return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /* All bytes must be initialized. */
- for (i = 0; i < size; ++i) {
- if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
- for (i = 0; i < size; ++i)
- x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index ff0b2f70fbcb..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,18 +1 @@
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
- KMEMCHECK_SHADOW_UNALLOCATED,
- KMEMCHECK_SHADOW_UNINITIALIZED,
- KMEMCHECK_SHADOW_INITIALIZED,
- KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
- unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
+/* SPDX-License-Identifier: GPL-2.0 */
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index afc47f5c9531..c21c2ed04612 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
* Benfit many code from kprobes
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644
index 000000000000..d9a9e9fc75dd
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt.c
@@ -0,0 +1,872 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[] __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL(sme_me_mask);
+DEFINE_STATIC_KEY_FALSE(sev_enable_key);
+EXPORT_SYMBOL_GPL(sev_enable_key);
+
+static bool sev_enabled __section(.data);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+ * There are limited number of early mapping slots, so map (at most)
+ * one page at time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ __native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!sme_active())
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!sme_active())
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (sev_active())
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
+{
+ unsigned long dma_mask;
+ unsigned int order;
+ struct page *page;
+ void *vaddr = NULL;
+
+ dma_mask = dma_alloc_coherent_mask(dev, gfp);
+ order = get_order(size);
+
+ /*
+ * Memory will be memset to zero after marking decrypted, so don't
+ * bother clearing it before.
+ */
+ gfp &= ~__GFP_ZERO;
+
+ page = alloc_pages_node(dev_to_node(dev), gfp, order);
+ if (page) {
+ dma_addr_t addr;
+
+ /*
+ * Since we will be clearing the encryption bit, check the
+ * mask with it already cleared.
+ */
+ addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
+ if ((addr + size) > dma_mask) {
+ __free_pages(page, get_order(size));
+ } else {
+ vaddr = page_address(page);
+ *dma_handle = addr;
+ }
+ }
+
+ if (!vaddr)
+ vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
+
+ if (!vaddr)
+ return NULL;
+
+ /* Clear the SME encryption bit for DMA use if not swiotlb area */
+ if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
+ set_memory_decrypted((unsigned long)vaddr, 1 << order);
+ memset(vaddr, 0, PAGE_SIZE << order);
+ *dma_handle = __sme_clr(*dma_handle);
+ }
+
+ return vaddr;
+}
+
+static void sev_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ /* Set the SME encryption bit for re-use if not swiotlb area */
+ if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle)))
+ set_memory_encrypted((unsigned long)vaddr,
+ 1 << get_order(size));
+
+ swiotlb_free_coherent(dev, size, vaddr, dma_handle);
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ old_prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ old_prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ old_prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ return;
+ }
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << page_level_shift(level);
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ kernel_physical_mapping_init(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * sme_active() and sev_active() functions are used for this. When a
+ * distinction isn't needed, the mem_encrypt_active() function can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+bool sme_active(void)
+{
+ return sme_me_mask && !sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sme_active);
+
+bool sev_active(void)
+{
+ return sme_me_mask && sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sev_active);
+
+static const struct dma_map_ops sev_dma_ops = {
+ .alloc = sev_alloc,
+ .free = sev_free,
+ .map_page = swiotlb_map_page,
+ .unmap_page = swiotlb_unmap_page,
+ .map_sg = swiotlb_map_sg_attrs,
+ .unmap_sg = swiotlb_unmap_sg_attrs,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .mapping_error = swiotlb_dma_mapping_error,
+};
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void)
+{
+ if (!sme_me_mask)
+ return;
+
+ /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+ swiotlb_update_mem_attributes();
+
+ /*
+ * With SEV, DMA operations cannot use encryption. New DMA ops
+ * are required in order to mark the DMA areas as decrypted or
+ * to use bounce buffers.
+ */
+ if (sev_active())
+ dma_ops = &sev_dma_ops;
+
+ /*
+ * With SEV, we need to unroll the rep string I/O instructions.
+ */
+ if (sev_active())
+ static_branch_enable(&sev_enable_key);
+
+ pr_info("AMD %s active\n",
+ sev_active() ? "Secure Encrypted Virtualization (SEV)"
+ : "Secure Memory Encryption (SME)");
+}
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
+{
+ WARN(PAGE_ALIGN(size) != size,
+ "size is not page-aligned (%#lx)\n", size);
+
+ /* Make the SWIOTLB buffer area decrypted */
+ set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
+}
+
+static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+ unsigned long end)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = start & PGDIR_MASK;
+ pgd_end = end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
+ pgd_size *= sizeof(pgd_t);
+
+ pgd_p = pgd_base + pgd_index(start);
+
+ memset(pgd_p, 0, pgd_size);
+}
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+ unsigned long vaddr, pmdval_t pmd_val)
+{
+ pgd_t *pgd_p;
+ p4d_t *p4d_p;
+ pud_t *pud_p;
+ pmd_t *pmd_p;
+
+ pgd_p = pgd_base + pgd_index(vaddr);
+ if (native_pgd_val(*pgd_p)) {
+ if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+ else
+ pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+ } else {
+ pgd_t pgd;
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_p = pgtable_area;
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
+
+ pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
+ } else {
+ pud_p = pgtable_area;
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+ pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
+ }
+ native_set_pgd(pgd_p, pgd);
+ }
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_p += p4d_index(vaddr);
+ if (native_p4d_val(*p4d_p)) {
+ pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
+ } else {
+ p4d_t p4d;
+
+ pud_p = pgtable_area;
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+ p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
+ native_set_p4d(p4d_p, p4d);
+ }
+ }
+
+ pud_p += pud_index(vaddr);
+ if (native_pud_val(*pud_p)) {
+ if (native_pud_val(*pud_p) & _PAGE_PSE)
+ goto out;
+
+ pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
+ } else {
+ pud_t pud;
+
+ pmd_p = pgtable_area;
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+ pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+ pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
+ native_set_pud(pud_p, pud);
+ }
+
+ pmd_p += pmd_index(vaddr);
+ if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
+ native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+
+out:
+ return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+ unsigned long p4d_size, pud_size, pmd_size;
+ unsigned long total;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+ * entries that are needed. That mappings will be covered by 2MB
+ * PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. Incrementing the count for each covers the case where
+ * the addresses cross entries.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+ p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+ pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ } else {
+ p4d_size = 0;
+ pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ }
+ pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
+ pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+ total = p4d_size + pud_size + pmd_size;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+ p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+ pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ } else {
+ p4d_size = 0;
+ pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+ pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+ }
+ pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+ pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+ total += p4d_size + pud_size + pmd_size;
+
+ return total;
+}
+
+void __init sme_encrypt_kernel(void)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long pgtable_area_len;
+ unsigned long paddr, pmd_flags;
+ unsigned long decrypted_base;
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ if (!sme_active())
+ return;
+
+ /*
+ * Prepare for encrypting the kernel by building new pagetables with
+ * the necessary attributes needed to encrypt the kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel as decrypted and write-protected.
+ *
+ * The use of write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+ /* Physical addresses gives us the identity mapped virtual addresses */
+ kernel_start = __pa_symbol(_text);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ /* Set the encryption workarea to be immediately after the kernel */
+ workarea_start = kernel_end;
+
+ /*
+ * Calculate required number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_PAGE_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+ * the pagetable area.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = workarea_start + workarea_len;
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ pgd = (pgd_t *)native_read_cr3_pa();
+ paddr = workarea_start;
+ while (paddr < workarea_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr,
+ paddr + PMD_FLAGS);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * to be encrypted. It starts with an empty PGD that will then be
+ * populated with new PUDs and PMDs as the encrypted and decrypted
+ * kernel mappings are created.
+ */
+ pgd = pgtable_area;
+ memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+ pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+ /* Add encrypted kernel (identity) mappings */
+ pmd_flags = PMD_FLAGS | _PAGE_ENC;
+ paddr = kernel_start;
+ while (paddr < kernel_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr,
+ paddr + pmd_flags);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+ paddr = kernel_start;
+ while (paddr < kernel_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr + decrypted_base,
+ paddr + pmd_flags);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ paddr = workarea_start;
+ while (paddr < workarea_end) {
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr,
+ paddr + PMD_FLAGS);
+
+ pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+ paddr + decrypted_base,
+ paddr + PMD_FLAGS);
+
+ paddr += PMD_PAGE_SIZE;
+ }
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ sme_clear_pgd(pgd, kernel_start + decrypted_base,
+ kernel_end + decrypted_base);
+
+ sme_clear_pgd(pgd, workarea_start + decrypted_base,
+ workarea_end + decrypted_base);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
+
+void __init __nostackprotector sme_enable(struct boot_params *bp)
+{
+ const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ bool active_by_default;
+ unsigned long me_mask;
+ char buffer[16];
+ u64 msr;
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
+ /*
+ * Set the feature mask (SME or SEV) based on whether we are
+ * running under a hypervisor.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (!(eax & feature_mask))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /* For SME, check the SYSCFG MSR */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+ } else {
+ /* For SEV, check the SEV MSR */
+ msr = __rdmsr(MSR_AMD64_SEV);
+ if (!(msr & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* SEV state cannot be controlled by a command line option */
+ sme_me_mask = me_mask;
+ sev_enabled = true;
+ return;
+ }
+
+ /*
+ * Fixups have not been applied to phys_base yet and we're running
+ * identity mapped, so we must obtain the address to the SME command
+ * line argument data using rip-relative addressing.
+ */
+ asm ("lea sme_cmdline_arg(%%rip), %0"
+ : "=r" (cmdline_arg)
+ : "p" (sme_cmdline_arg));
+ asm ("lea sme_cmdline_on(%%rip), %0"
+ : "=r" (cmdline_on)
+ : "p" (sme_cmdline_on));
+ asm ("lea sme_cmdline_off(%%rip), %0"
+ : "=r" (cmdline_off)
+ : "p" (sme_cmdline_off));
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+ active_by_default = true;
+ else
+ active_by_default = false;
+
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+
+ cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+ if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+ sme_me_mask = me_mask;
+ else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+ sme_me_mask = 0;
+ else
+ sme_me_mask = active_by_default ? me_mask : 0;
+}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 000000000000..730e6d541df1
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,149 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+
+ .text
+ .code64
+ENTRY(sme_encrypt_execute)
+
+ /*
+ * Entry parameters:
+ * RDI - virtual address for the encrypted kernel mapping
+ * RSI - virtual address for the decrypted kernel mapping
+ * RDX - length of kernel
+ * RCX - virtual address of the encryption workarea, including:
+ * - stack page (PAGE_SIZE)
+ * - encryption routine page (PAGE_SIZE)
+ * - intermediate copy buffer (PMD_PAGE_SIZE)
+ * R8 - physcial address of the pagetables to use for encryption
+ */
+
+ push %rbp
+ movq %rsp, %rbp /* RBP now has original stack pointer */
+
+ /* Set up a one page stack in the non-encrypted memory area */
+ movq %rcx, %rax /* Workarea stack page */
+ leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */
+ addq $PAGE_SIZE, %rax /* Workarea encryption routine */
+
+ push %r12
+ movq %rdi, %r10 /* Encrypted kernel */
+ movq %rsi, %r11 /* Decrypted kernel */
+ movq %rdx, %r12 /* Kernel length */
+
+ /* Copy encryption routine into the workarea */
+ movq %rax, %rdi /* Workarea encryption routine */
+ leaq __enc_copy(%rip), %rsi /* Encryption routine */
+ movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */
+ rep movsb
+
+ /* Setup registers for call */
+ movq %r10, %rdi /* Encrypted kernel */
+ movq %r11, %rsi /* Decrypted kernel */
+ movq %r8, %rdx /* Pagetables used for encryption */
+ movq %r12, %rcx /* Kernel length */
+ movq %rax, %r8 /* Workarea encryption routine */
+ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
+
+ call *%rax /* Call the encryption routine */
+
+ pop %r12
+
+ movq %rbp, %rsp /* Restore original stack pointer */
+ pop %rbp
+
+ ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt kernel.
+ * This routine must be run outside of the kernel proper since
+ * the kernel will be encrypted during the process. So this
+ * routine is defined here and then copied to an area outside
+ * of the kernel where it will remain and run decrypted
+ * during execution.
+ *
+ * On entry the registers must be:
+ * RDI - virtual address for the encrypted kernel mapping
+ * RSI - virtual address for the decrypted kernel mapping
+ * RDX - address of the pagetables to use for encryption
+ * RCX - length of kernel
+ * R8 - intermediate copy buffer
+ *
+ * RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+ /* Enable the new page tables */
+ mov %rdx, %cr3
+
+ /* Flush any global TLBs */
+ mov %cr4, %rdx
+ andq $~X86_CR4_PGE, %rdx
+ mov %rdx, %cr4
+ orq $X86_CR4_PGE, %rdx
+ mov %rdx, %cr4
+
+ /* Set the PAT register PA5 entry to write-protect */
+ push %rcx
+ movl $MSR_IA32_CR_PAT, %ecx
+ rdmsr
+ push %rdx /* Save original PAT value */
+ andl $0xffff00ff, %edx /* Clear PA5 */
+ orl $0x00000500, %edx /* Set PA5 to WP */
+ wrmsr
+ pop %rdx /* RDX contains original PAT value */
+ pop %rcx
+
+ movq %rcx, %r9 /* Save kernel length */
+ movq %rdi, %r10 /* Save encrypted kernel address */
+ movq %rsi, %r11 /* Save decrypted kernel address */
+
+ wbinvd /* Invalidate any cache entries */
+
+ /* Copy/encrypt 2MB at a time */
+1:
+ movq %r11, %rsi /* Source - decrypted kernel */
+ movq %r8, %rdi /* Dest - intermediate copy buffer */
+ movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
+ rep movsb
+
+ movq %r8, %rsi /* Source - intermediate copy buffer */
+ movq %r10, %rdi /* Dest - encrypted kernel */
+ movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
+ rep movsb
+
+ addq $PMD_PAGE_SIZE, %r11
+ addq $PMD_PAGE_SIZE, %r10
+ subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */
+ jnz 1b /* Kernel length not zero? */
+
+ /* Restore PAT register */
+ push %rdx /* Save original PAT value */
+ movl $MSR_IA32_CR_PAT, %ecx
+ rdmsr
+ pop %rdx /* Restore original PAT value */
+ wrmsr
+
+ ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 62474ba66c8e..4e1f6e1b8159 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_MM_INTERNAL_H
#define __X86_MM_INTERNAL_H
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 229d04a83f85..155ecbac9e28 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -33,26 +33,27 @@
#include <linux/compat.h>
#include <asm/elf.h>
+#include "physaddr.h"
+
struct va_alignment __read_mostly va_align = {
.flags = -1,
};
-unsigned long tasksize_32bit(void)
+unsigned long task_size_32bit(void)
{
return IA32_PAGE_OFFSET;
}
-unsigned long tasksize_64bit(void)
+unsigned long task_size_64bit(int full_addr_space)
{
- return TASK_SIZE_MAX;
+ return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
}
static unsigned long stack_maxrandom_size(unsigned long task_size)
{
unsigned long max = 0;
- if ((current->flags & PF_RANDOMIZE) &&
- !(current->personality & ADDR_NO_RANDOMIZE)) {
- max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+ if (current->flags & PF_RANDOMIZE) {
+ max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
max <<= PAGE_SHIFT;
}
@@ -79,13 +80,13 @@ static int mmap_is_legacy(void)
static unsigned long arch_rnd(unsigned int rndbits)
{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
}
unsigned long arch_mmap_rnd(void)
{
- if (!(current->flags & PF_RANDOMIZE))
- return 0;
return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
@@ -142,7 +143,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
- arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+ arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/*
@@ -152,7 +153,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* mmap_base, the compat syscall uses mmap_compat_base.
*/
arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
- arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+ arch_rnd(mmap32_rnd_bits), task_size_32bit());
#endif
}
@@ -175,3 +176,63 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return "[mpx]";
return NULL;
}
+
+/**
+ * mmap_address_hint_valid - Validate the address hint of mmap
+ * @addr: Address hint
+ * @len: Mapping length
+ *
+ * Check whether @addr and @addr + @len result in a valid mapping.
+ *
+ * On 32bit this only checks whether @addr + @len is <= TASK_SIZE.
+ *
+ * On 64bit with 5-level page tables another sanity check is required
+ * because mappings requested by mmap(@addr, 0) which cross the 47-bit
+ * virtual address boundary can cause the following theoretical issue:
+ *
+ * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr
+ * is below the border of the 47-bit address space and @addr + @len is
+ * above the border.
+ *
+ * With 4-level paging this request succeeds, but the resulting mapping
+ * address will always be within the 47-bit virtual address space, because
+ * the hint address does not result in a valid mapping and is
+ * ignored. Hence applications which are not prepared to handle virtual
+ * addresses above 47-bit work correctly.
+ *
+ * With 5-level paging this request would be granted and result in a
+ * mapping which crosses the border of the 47-bit virtual address
+ * space. If the application cannot handle addresses above 47-bit this
+ * will lead to misbehaviour and hard to diagnose failures.
+ *
+ * Therefore ignore address hints which would result in a mapping crossing
+ * the 47-bit virtual address boundary.
+ *
+ * Note, that in the same scenario with MAP_FIXED the behaviour is
+ * different. The request with @addr < 47-bit and @addr + @len > 47-bit
+ * fails on a 4-level paging machine but succeeds on a 5-level paging
+ * machine. It is reasonable to expect that an application does not rely on
+ * the failure of such a fixed mapping request, so the restriction is not
+ * applied.
+ */
+bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
+{
+ if (TASK_SIZE - len < addr)
+ return false;
+
+ return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW);
+}
+
+/* Can we access it for direct reading/writing? Must be RAM: */
+int valid_phys_addr_range(phys_addr_t addr, size_t count)
+{
+ return addr + count <= __pa(high_memory);
+}
+
+/* Can we access it through mmap? Must be a valid physical address: */
+int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
+{
+ phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT;
+
+ return phys_addr_valid(addr + count - 1);
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 1c34b767c84c..e500949bae24 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* mpx.c - Memory Protection eXtensions
*
@@ -12,6 +13,7 @@
#include <linux/sched/sysctl.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mman.h>
#include <asm/mmu_context.h>
#include <asm/mpx.h>
@@ -60,123 +62,6 @@ static unsigned long mpx_mmap(unsigned long len)
return addr;
}
-enum reg_type {
- REG_TYPE_RM = 0,
- REG_TYPE_INDEX,
- REG_TYPE_BASE,
-};
-
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
- enum reg_type type)
-{
- int regno = 0;
-
- static const int regoff[] = {
- offsetof(struct pt_regs, ax),
- offsetof(struct pt_regs, cx),
- offsetof(struct pt_regs, dx),
- offsetof(struct pt_regs, bx),
- offsetof(struct pt_regs, sp),
- offsetof(struct pt_regs, bp),
- offsetof(struct pt_regs, si),
- offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
- offsetof(struct pt_regs, r8),
- offsetof(struct pt_regs, r9),
- offsetof(struct pt_regs, r10),
- offsetof(struct pt_regs, r11),
- offsetof(struct pt_regs, r12),
- offsetof(struct pt_regs, r13),
- offsetof(struct pt_regs, r14),
- offsetof(struct pt_regs, r15),
-#endif
- };
- int nr_registers = ARRAY_SIZE(regoff);
- /*
- * Don't possibly decode a 32-bit instructions as
- * reading a 64-bit-only register.
- */
- if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
- nr_registers -= 8;
-
- switch (type) {
- case REG_TYPE_RM:
- regno = X86_MODRM_RM(insn->modrm.value);
- if (X86_REX_B(insn->rex_prefix.value))
- regno += 8;
- break;
-
- case REG_TYPE_INDEX:
- regno = X86_SIB_INDEX(insn->sib.value);
- if (X86_REX_X(insn->rex_prefix.value))
- regno += 8;
- break;
-
- case REG_TYPE_BASE:
- regno = X86_SIB_BASE(insn->sib.value);
- if (X86_REX_B(insn->rex_prefix.value))
- regno += 8;
- break;
-
- default:
- pr_err("invalid register type");
- BUG();
- break;
- }
-
- if (regno >= nr_registers) {
- WARN_ONCE(1, "decoded an instruction with an invalid register");
- return -EINVAL;
- }
- return regoff[regno];
-}
-
-/*
- * return the address being referenced be instruction
- * for rm=3 returning the content of the rm reg
- * for rm!=3 calculates the address using SIB and Disp
- */
-static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
-{
- unsigned long addr, base, indx;
- int addr_offset, base_offset, indx_offset;
- insn_byte_t sib;
-
- insn_get_modrm(insn);
- insn_get_sib(insn);
- sib = insn->sib.value;
-
- if (X86_MODRM_MOD(insn->modrm.value) == 3) {
- addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
- if (addr_offset < 0)
- goto out_err;
- addr = regs_get_register(regs, addr_offset);
- } else {
- if (insn->sib.nbytes) {
- base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
- if (base_offset < 0)
- goto out_err;
-
- indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
- if (indx_offset < 0)
- goto out_err;
-
- base = regs_get_register(regs, base_offset);
- indx = regs_get_register(regs, indx_offset);
- addr = base + indx * (1 << X86_SIB_SCALE(sib));
- } else {
- addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
- if (addr_offset < 0)
- goto out_err;
- addr = regs_get_register(regs, addr_offset);
- }
- addr += insn->displacement.value;
- }
- return (void __user *)addr;
-out_err:
- return (void __user *)-1;
-}
-
static int mpx_insn_decode(struct insn *insn,
struct pt_regs *regs)
{
@@ -289,7 +174,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
info->si_signo = SIGSEGV;
info->si_errno = 0;
info->si_code = SEGV_BNDERR;
- info->si_addr = mpx_get_addr_ref(&insn, regs);
+ info->si_addr = insn_get_addr_ref(&insn, regs);
/*
* We were not able to extract an address from the instruction,
* probably because there was something invalid in it.
@@ -355,10 +240,19 @@ int mpx_enable_management(void)
*/
bd_base = mpx_get_bounds_dir();
down_write(&mm->mmap_sem);
+
+ /* MPX doesn't support addresses above 47 bits yet. */
+ if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+ pr_warn_once("%s (%d): MPX cannot handle addresses "
+ "above 47-bits. Disabling.",
+ current->comm, current->pid);
+ ret = -ENXIO;
+ goto out;
+ }
mm->context.bd_addr = bd_base;
if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
ret = -ENXIO;
-
+out:
up_write(&mm->mmap_sem);
return ret;
}
@@ -1030,3 +924,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
if (ret)
force_sig(SIGSEGV, current);
}
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+ unsigned long flags)
+{
+ if (!kernel_managing_mpx_tables(current->mm))
+ return addr;
+ if (addr + len <= DEFAULT_MAP_WINDOW)
+ return addr;
+ if (flags & MAP_FIXED)
+ return -ENOMEM;
+
+ /*
+ * Requested len is larger than the whole area we're allowed to map in.
+ * Resetting hinting address wouldn't do much good -- fail early.
+ */
+ if (len > DEFAULT_MAP_WINDOW)
+ return -ENOMEM;
+
+ /* Look for unmap area within DEFAULT_MAP_WINDOW */
+ return 0;
+}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9405ffc91502..066f3511d5f1 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce3dedf..34a2a3bfde9c 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NUMA emulation
*/
@@ -75,13 +76,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
/*
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr. The return value is the number of nodes allocated.
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
*/
static int __init split_nodes_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, int nr_nodes)
{
- nodemask_t physnode_mask = NODE_MASK_NONE;
+ nodemask_t physnode_mask = numa_nodes_parsed;
u64 size;
int big;
int nid = 0;
@@ -116,9 +119,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
return -1;
}
- for (i = 0; i < pi->nr_blks; i++)
- node_set(pi->blk[i].nid, physnode_mask);
-
/*
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
@@ -200,13 +200,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'. The return value is the number of nodes allocated.
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
*/
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, u64 size)
{
- nodemask_t physnode_mask = NODE_MASK_NONE;
+ nodemask_t physnode_mask = numa_nodes_parsed;
u64 min_size;
int nid = 0;
int i, ret;
@@ -231,9 +233,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
}
size &= FAKE_NODE_MIN_HASH_MASK;
- for (i = 0; i < pi->nr_blks; i++)
- node_set(pi->blk[i].nid, physnode_mask);
-
/*
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
@@ -280,6 +279,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
return 0;
}
+int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+ int i, max_emu_nid = 0;
+
+ *dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (*dfl_phys_nid == NUMA_NO_NODE)
+ *dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+
+ return max_emu_nid;
+}
+
/**
* numa_emulation - Emulate NUMA nodes
* @numa_meminfo: NUMA configuration to massage
@@ -376,23 +391,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* Determine the max emulated nid and the default phys nid to use
* for unmapped nodes.
*/
- max_emu_nid = 0;
- dfl_phys_nid = NUMA_NO_NODE;
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
- if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
- max_emu_nid = i;
- if (dfl_phys_nid == NUMA_NO_NODE)
- dfl_phys_nid = emu_nid_to_phys[i];
- }
- }
- if (dfl_phys_nid == NUMA_NO_NODE) {
- pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
- goto no_emu;
- }
+ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
/* commit */
*numa_meminfo = ei;
+ /* Make sure numa_nodes_parsed only contains emulated nodes */
+ nodes_clear(numa_nodes_parsed);
+ for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+ if (ei.blk[i].start != ei.blk[i].end &&
+ ei.blk[i].nid != NUMA_NO_NODE)
+ node_set(ei.blk[i].nid, numa_nodes_parsed);
+
/*
* Transform __apicid_to_node table to use emulated nids by
* reverse-mapping phys_nid. The maps should always exist but fall
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ad86ec91e640..86860f279662 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_MM_NUMA_INTERNAL_H
#define __X86_MM_NUMA_INTERNAL_H
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 5f169d5d76a8..a25588ad75ef 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* self test for change_page_attr.
*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 757b0bcdf712..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ base = alloc_pages(GFP_KERNEL, 0);
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
static int alloc_pte_page(pmd_t *pmd)
{
- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
if (!pte)
return -1;
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
static int alloc_pmd_page(pud_t *pud)
{
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
return -1;
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
pgd_entry = cpa->pgd + pgd_index(addr);
if (pgd_none(*pgd_entry)) {
- p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
if (!p4d)
return -1;
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
*/
p4d = p4d_offset(pgd_entry, addr);
if (p4d_none(*p4d)) {
- pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
return -1;
@@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages)
__pgprot(0), 1, 0, NULL);
}
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+ struct cpa_data cpa;
+ unsigned long start;
+ int ret;
+
+ /* Nothing to do if memory encryption is not active */
+ if (!mem_encrypt_active())
+ return 0;
+
+ /* Should not be working on unaligned addresses */
+ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+ addr &= PAGE_MASK;
+
+ start = addr;
+
+ memset(&cpa, 0, sizeof(cpa));
+ cpa.vaddr = &addr;
+ cpa.numpages = numpages;
+ cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+ cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+ cpa.pgd = init_mm.pgd;
+
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+ vm_unmap_aliases();
+
+ /*
+ * Before changing the encryption attribute, we need to flush caches.
+ */
+ if (static_cpu_has(X86_FEATURE_CLFLUSH))
+ cpa_flush_range(start, numpages, 1);
+ else
+ cpa_flush_all(1);
+
+ ret = __change_page_attr_set_clr(&cpa, 1);
+
+ /*
+ * After changing the encryption attribute, we need to flush TLBs
+ * again in case any speculative TLB caching occurred (but no need
+ * to flush caches again). We could just use cpa_flush_all(), but
+ * in case TLB flushing gets optimized in the cpa_flush_range()
+ * path use the same logic as above.
+ */
+ if (static_cpu_has(X86_FEATURE_CLFLUSH))
+ cpa_flush_range(start, numpages, 0);
+ else
+ cpa_flush_all(0);
+
+ return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+ return __set_memory_enc_dec(addr, numpages, true);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+ return __set_memory_enc_dec(addr, numpages, false);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
+
int set_pages_uc(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
if (!(page_flags & _PAGE_RW))
cpa.mask_clr = __pgprot(_PAGE_RW);
+ if (!(page_flags & _PAGE_ENC))
+ cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
retval = __change_page_attr_set_clr(&cpa, 0);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 45979502f64b..fe7d57a8fb60 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -293,7 +293,7 @@ void init_cache_modes(void)
* pat_init - Initialize PAT MSR and PAT table
*
* This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC and WT.
+ * to enable additional cache attributes, WC, WT and WP.
*
* This function must be called on all CPUs using the specific sequence of
* operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
* 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
* 011 3 UC : _PAGE_CACHE_MODE_UC
* 100 4 WB : Reserved
- * 101 5 WC : Reserved
+ * 101 5 WP : _PAGE_CACHE_MODE_WP
* 110 6 UC-: Reserved
* 111 7 WT : _PAGE_CACHE_MODE_WT
*
@@ -360,7 +360,7 @@ void pat_init(void)
* corresponding types in the presence of PAT errata.
*/
pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+ PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
}
if (!boot_cpu_done) {
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
+ if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+ vma_prot = pgprot_decrypted(vma_prot);
+
return vma_prot;
}
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index a739bfc40690..eeb5caeb089b 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PAT_INTERNAL_H_
#define __PAT_INTERNAL_H_
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index d76485b22824..fa16036fa592 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Handle caching attributes in page tables (PAT)
*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/pgalloc.h>
@@ -6,7 +7,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -56,7 +57,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
+ tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -72,21 +73,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
- tlb_remove_page(tlb, page);
+ tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pud));
+ tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(p4d));
+ tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -426,10 +427,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
{
int changed = !pte_same(*ptep, entry);
- if (changed && dirty) {
+ if (changed && dirty)
*ptep = entry;
- pte_update(vma->vm_mm, address, ptep);
- }
return changed;
}
@@ -486,9 +485,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
- if (ret)
- pte_update(vma->vm_mm, addr, ptep);
-
return ret;
}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b9bd5b8b14fa..6b9bf023a700 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index cfc3b9121ce4..7f9acb68324c 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
index a3cd5a0c97b3..9f6419cafc32 100644
--- a/arch/x86/mm/physaddr.h
+++ b/arch/x86/mm/physaddr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>
static inline int phys_addr_valid(resource_size_t addr)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 2dab69a706ec..d7bc0eea20a5 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -18,7 +18,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/fpu/internal.h> /* fpregs_active() */
int __execute_only_pkey(struct mm_struct *mm)
{
@@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm)
*/
preempt_disable();
if (!need_to_set_mm_pkey &&
- fpregs_active() &&
+ current->thread.fpu.initialized &&
!__pkru_allows_read(read_pkru(), execute_only_pkey)) {
preempt_enable();
return execute_only_pkey;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index f65a33f505b6..adb3c5784dac 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/init.h>
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 3ea20d61b523..dac07e4f5834 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* ACPI 3.0 based NUMA setup
* Copyright 2004 Andi Kleen, SuSE Labs.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..3118392cdf75 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,43 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+ u16 *new_asid, bool *need_flush)
+{
+ u16 asid;
+
+ if (!static_cpu_has(X86_FEATURE_PCID)) {
+ *new_asid = 0;
+ *need_flush = true;
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+ next->context.ctx_id)
+ continue;
+
+ *new_asid = asid;
+ *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+ next_tlb_gen);
+ return;
+ }
+
+ /*
+ * We don't currently own an ASID slot on this CPU.
+ * Allocate a slot.
+ */
+ *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+ if (*new_asid >= TLB_NR_DYN_ASIDS) {
+ *new_asid = 0;
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ }
+ *need_flush = true;
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -43,8 +80,8 @@ void leave_mm(int cpu)
if (loaded_mm == &init_mm)
return;
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- BUG();
+ /* Warn if we're not lazy. */
+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -63,115 +100,308 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- unsigned cpu = smp_processor_id();
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned cpu = smp_processor_id();
+ u64 next_tlb_gen;
/*
- * NB: The scheduler will call us with prev == next when
- * switching from lazy TLB mode to normal mode if active_mm
- * isn't changing. When this happens, there is no guarantee
- * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+ * NB: The scheduler will call us with prev == next when switching
+ * from lazy TLB mode to normal mode if active_mm isn't changing.
+ * When this happens, we don't assume that CR3 (and hence
+ * cpu_tlbstate.loaded_mm) matches next.
*
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ /* We don't want flush_tlb_func_* to run concurrently with us. */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(!irqs_disabled());
+
+ /*
+ * Verify that CR3 is what we think it is. This will catch
+ * hypothetical buggy code that directly switches to swapper_pg_dir
+ * without going through leave_mm() / switch_mm_irqs_off() or that
+ * does something like write_cr3(read_cr3_pa()).
+ *
+ * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
+ * isn't free.
+ */
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+ /*
+ * If we were to BUG here, we'd be very likely to kill
+ * the system so hard that we don't see the call trace.
+ * Try to recover instead by ignoring the error and doing
+ * a global flush to minimize the chance of corruption.
+ *
+ * (This is far from being a fully correct recovery.
+ * Architecturally, the CPU could prefetch something
+ * back into an incorrect ASID slot and leave it there
+ * to cause trouble down the road. It's better than
+ * nothing, though.)
+ */
+ __flush_tlb_all();
+ }
+#endif
+ this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
+
/*
- * There's nothing to do: we always keep the per-mm control
- * regs in sync with cpu_tlbstate.loaded_mm. Just
- * sanity-check mm_cpumask.
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
- if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
+ if (WARN_ON_ONCE(real_prev != &init_mm &&
+ !cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
+
return;
- }
+ } else {
+ u16 new_asid;
+ bool need_flush;
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ unsigned int index = pgd_index(current_stack_pointer);
+ pgd_t *pgd = next->pgd + index;
+
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[index]);
+ }
+
+ /* Stop remote flushes for the previous mm */
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+ real_prev != &init_mm);
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
+ * Start remote flushes and then read tlb_gen.
*/
- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ write_cr3(build_cr3(next, new_asid));
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ write_cr3(build_cr3_noflush(next, new_asid));
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
- pgd_t *pgd = next->pgd + stack_pgd_index;
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ }
+
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+}
+
+/*
+ * Please ignore the name of this function. It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+ return;
- if (unlikely(pgd_none(*pgd)))
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ if (tlb_defer_switch_to_init_mm()) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
}
+}
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
+/*
+ * Call this when reinitializing a CPU. It fixes the following potential
+ * problems:
+ *
+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely
+ * because the CPU was taken down and came back up with CR3's PCID
+ * bits clear. CPU hotplug can do this.
+ *
+ * - The TLB contains junk in slots corresponding to inactive ASIDs.
+ *
+ * - The CPU went so far out to lunch that it may have missed a TLB
+ * flush.
+ */
+void initialize_tlbstate_and_flush(void)
+{
+ int i;
+ struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long cr3 = __read_cr3();
- WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ /* Assert that CR3 already references the right mm. */
+ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
/*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
+ * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
+ * doesn't work like other CR4 bits because it can only be set from
+ * long mode.)
*/
- load_cr3(next->pgd);
+ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
+ !(cr4_read_shadow() & X86_CR4_PCIDE));
- /*
- * This gets called via leave_mm() in the idle path where RCU
- * functions differently. Tracing normally uses RCU, so we have to
- * call the tracepoint specially here.
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ /* Force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm, 0));
- /* Stop flush ipis for the previous mm */
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ /* Reinitialize tlbstate. */
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
- /* Load per-mm CR4 and LDTR state */
- load_mm_cr4(next);
- switch_ldt(real_prev, next);
+ for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
+ this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen. We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
static void flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
+ /*
+ * We have three different tlb_gen values in here. They are:
+ *
+ * - mm_tlb_gen: the latest generation.
+ * - local_tlb_gen: the generation that this CPU has already caught
+ * up to.
+ * - f->new_tlb_gen: the generation that the requester of the flush
+ * wants us to catch up to.
+ */
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
- leave_mm(smp_processor_id());
+ if (unlikely(loaded_mm == &init_mm))
+ return;
+
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+ loaded_mm->context.ctx_id);
+
+ if (this_cpu_read(cpu_tlbstate.is_lazy)) {
+ /*
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
+ */
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
- if (f->end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- if (local)
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
- } else {
+ if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+ /*
+ * There's nothing to do: we're already up to date. This can
+ * happen if two concurrent flushes happen -- the first flush to
+ * be handled can catch us all the way up, leaving no work for
+ * the second flush.
+ */
+ trace_tlb_flush(reason, 0);
+ return;
+ }
+
+ WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+ WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+ /*
+ * If we get to this point, we know that our TLB is out of date.
+ * This does not strictly imply that we need to flush (it's
+ * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+ * going to need to flush in the very near future, so we might
+ * as well get it over with.
+ *
+ * The only question is whether to do a full or partial flush.
+ *
+ * We do a partial flush if requested and two extra conditions
+ * are met:
+ *
+ * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
+ * we've always done all needed flushes to catch up to
+ * local_tlb_gen. If, for example, local_tlb_gen == 2 and
+ * f->new_tlb_gen == 3, then we know that the flush needed to bring
+ * us up to date for tlb_gen 3 is the partial flush we're
+ * processing.
+ *
+ * As an example of why this check is needed, suppose that there
+ * are two concurrent flushes. The first is a full flush that
+ * changes context.tlb_gen from 1 to 2. The second is a partial
+ * flush that changes context.tlb_gen from 2 to 3. If they get
+ * processed on this CPU in reverse order, we'll see
+ * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+ * If we were to use __flush_tlb_single() and set local_tlb_gen to
+ * 3, we'd be break the invariant: we'd update local_tlb_gen above
+ * 1 without the full flush that's needed for tlb_gen 2.
+ *
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
+ * Partial TLB flushes are not all that much cheaper than full TLB
+ * flushes, so it seems unlikely that it would be a performance win
+ * to do a partial flush if that won't bring our TLB fully up to
+ * date. By doing a full flush instead, we can increase
+ * local_tlb_gen all the way to mm_tlb_gen and we can probably
+ * avoid another flush in the very near future.
+ */
+ if (f->end != TLB_FLUSH_ALL &&
+ f->new_tlb_gen == local_tlb_gen + 1 &&
+ f->new_tlb_gen == mm_tlb_gen) {
+ /* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
addr = f->start;
while (addr < f->end) {
__flush_tlb_single(addr);
@@ -180,7 +410,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
+ } else {
+ /* Full flush. */
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
+
+ /* Both paths above update our state to mm_tlb_gen. */
+ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -214,6 +453,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
+ /*
+ * This whole special case is confused. UV has a "Broadcast
+ * Assist Unit", which seems to be a fancy way to send IPIs.
+ * Back when x86 used an explicit TLB flush IPI, UV was
+ * optimized to use its own mechanism. These days, x86 uses
+ * smp_call_function_many(), but UV still uses a manual IPI,
+ * and that IPI's action is out of date -- it does a manual
+ * flush instead of calling flush_tlb_func_remote(). This
+ * means that the percpu tlb_gen variables won't be updated
+ * and we'll do pointless flushes on future context switches.
+ *
+ * Rather than hooking native_flush_tlb_others() here, I think
+ * that UV should be updated so that smp_call_function_many(),
+ * etc, are optimal on UV.
+ */
unsigned int cpu;
cpu = smp_processor_id();
@@ -250,8 +504,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
cpu = get_cpu();
- /* Synchronize with switch_mm. */
- smp_mb();
+ /* This is also a barrier that synchronizes with switch_mm(). */
+ info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
@@ -273,6 +527,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), &info);
+
put_cpu();
}
@@ -281,8 +536,6 @@ static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
- leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
@@ -335,6 +588,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, &info);
+
cpumask_clear(&batch->cpumask);
put_cpu();
OpenPOWER on IntegriCloud