summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/44x_mmu.c2
-rw-r--r--arch/powerpc/mm/8xx_mmu.c33
-rw-r--r--arch/powerpc/mm/Makefile17
-rw-r--r--arch/powerpc/mm/copro_fault.c4
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c2
-rw-r--r--arch/powerpc/mm/dump_hashpagetable.c3
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables-8xx.c82
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables-book3s64.c120
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables-generic.c82
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables.c167
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables.h19
-rw-r--r--arch/powerpc/mm/fault.c166
-rw-r--r--arch/powerpc/mm/hash64_4k.c8
-rw-r--r--arch/powerpc/mm/hash64_64k.c15
-rw-r--r--arch/powerpc/mm/hash_low_32.S1
-rw-r--r--arch/powerpc/mm/hash_native_64.c97
-rw-r--r--arch/powerpc/mm/hash_utils_64.c98
-rw-r--r--arch/powerpc/mm/highmem.c2
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c15
-rw-r--r--arch/powerpc/mm/hugetlbpage-hash64.c4
-rw-r--r--arch/powerpc/mm/hugetlbpage.c114
-rw-r--r--arch/powerpc/mm/init_32.c7
-rw-r--r--arch/powerpc/mm/init_64.c57
-rw-r--r--arch/powerpc/mm/mem.c53
-rw-r--r--arch/powerpc/mm/mmap.c28
-rw-r--r--arch/powerpc/mm/mmu_context.c6
-rw-r--r--arch/powerpc/mm/mmu_context_book3s64.c129
-rw-r--r--arch/powerpc/mm/mmu_context_hash32.c1
-rw-r--r--arch/powerpc/mm/mmu_context_iommu.c85
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c153
-rw-r--r--arch/powerpc/mm/mmu_decl.h8
-rw-r--r--arch/powerpc/mm/numa.c83
-rw-r--r--arch/powerpc/mm/pgtable-book3e.c9
-rw-r--r--arch/powerpc/mm/pgtable-book3s64.c337
-rw-r--r--arch/powerpc/mm/pgtable-hash64.c21
-rw-r--r--arch/powerpc/mm/pgtable-radix.c388
-rw-r--r--arch/powerpc/mm/pgtable.c81
-rw-r--r--arch/powerpc/mm/pgtable_32.c74
-rw-r--r--arch/powerpc/mm/pgtable_64.c233
-rw-r--r--arch/powerpc/mm/pkeys.c144
-rw-r--r--arch/powerpc/mm/ppc_mmu_32.c6
-rw-r--r--arch/powerpc/mm/slb.c777
-rw-r--r--arch/powerpc/mm/slb_low.S327
-rw-r--r--arch/powerpc/mm/slice.c512
-rw-r--r--arch/powerpc/mm/subpage-prot.c5
-rw-r--r--arch/powerpc/mm/tlb-radix.c510
-rw-r--r--arch/powerpc/mm/tlb_hash32.c10
-rw-r--r--arch/powerpc/mm/tlb_hash64.c2
-rw-r--r--arch/powerpc/mm/tlb_low_64e.S1
-rw-r--r--arch/powerpc/mm/tlb_nohash.c3
-rw-r--r--arch/powerpc/mm/tlb_nohash_low.S2
51 files changed, 3027 insertions, 2076 deletions
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 82b1ff759e26..12d92518e898 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
}
#ifdef CONFIG_SMP
-void mmu_init_secondary(int cpu)
+void __init mmu_init_secondary(int cpu)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index 849f50cd62f2..01b7f5107c3a 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -13,6 +13,7 @@
*/
#include <linux/memblock.h>
+#include <linux/mmu_context.h>
#include <asm/fixmap.h>
#include <asm/code-patching.h>
@@ -67,7 +68,7 @@ void __init MMU_init_hw(void)
/* PIN up to the 3 first 8Mb after IMMR in DTLB table */
#ifdef CONFIG_PIN_TLB_DATA
unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
- unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY;
+ unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
#ifdef CONFIG_PIN_TLB_IMMR
int i = 29;
#else
@@ -79,7 +80,7 @@ void __init MMU_init_hw(void)
for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
mtspr(SPRN_MD_CTR, ctr | (i << 8));
mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
- mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID | M_APG2);
+ mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
addr += LARGE_PAGE_SIZE_8M;
mem -= LARGE_PAGE_SIZE_8M;
@@ -91,29 +92,19 @@ static void __init mmu_mapin_immr(void)
{
unsigned long p = PHYS_IMMR_BASE;
unsigned long v = VIRT_IMMR_BASE;
- unsigned long f = pgprot_val(PAGE_KERNEL_NCG);
int offset;
for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE)
- map_kernel_page(v + offset, p + offset, f);
+ map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG);
}
-/* Address of instructions to patch */
-#ifndef CONFIG_PIN_TLB_IMMR
-extern unsigned int DTLBMiss_jmp;
-#endif
-extern unsigned int DTLBMiss_cmp, FixupDAR_cmp;
-#ifndef CONFIG_PIN_TLB_TEXT
-extern unsigned int ITLBMiss_cmp;
-#endif
-
-static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped)
+static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
{
- unsigned int instr = *addr;
+ unsigned int instr = *(unsigned int *)patch_site_addr(site);
instr &= 0xffff0000;
instr |= (unsigned long)__va(mapped) >> 16;
- patch_instruction(addr, instr);
+ patch_instruction_site(site, instr);
}
unsigned long __init mmu_mapin_ram(unsigned long top)
@@ -124,17 +115,17 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
mapped = 0;
mmu_mapin_immr();
#ifndef CONFIG_PIN_TLB_IMMR
- patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP);
+ patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
#endif
#ifndef CONFIG_PIN_TLB_TEXT
- mmu_patch_cmp_limit(&ITLBMiss_cmp, 0);
+ mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
#endif
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
}
- mmu_patch_cmp_limit(&DTLBMiss_cmp, mapped);
- mmu_patch_cmp_limit(&FixupDAR_cmp, mapped);
+ mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped);
+ mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped);
/* If the size of RAM is not an exact power of two, we may not
* have covered RAM in its entirety with 8 MiB
@@ -192,7 +183,7 @@ void set_context(unsigned long id, pgd_t *pgd)
mtspr(SPRN_M_TW, __pa(pgd) - offset);
/* Update context */
- mtspr(SPRN_M_CASID, id);
+ mtspr(SPRN_M_CASID, id - 1);
/* sync */
mb();
}
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index f06f3577d8d1..ca96e7be4d0e 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -3,10 +3,10 @@
# Makefile for the linux ppc-specific parts of the memory manager.
#
-subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
+CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
+
obj-y := fault.o mem.o pgtable.o mmap.o \
init_$(BITS).o pgtable_$(BITS).o \
init-common.o mmu_context.o drmem.o
@@ -15,11 +15,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o
hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o
-obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o
-ifeq ($(CONFIG_PPC_BOOK3S_64),y)
+ifdef CONFIG_PPC_BOOK3S_64
obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o
obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o
endif
@@ -31,7 +31,7 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
obj-$(CONFIG_PPC_SPLPAR) += vphn.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
obj-y += hugetlbpage.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
+ifdef CONFIG_HUGETLB_PAGE
obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o
obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
@@ -43,5 +43,12 @@ obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o
obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o
+ifdef CONFIG_PPC_PTDUMP
+obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o
+obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o
+obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o
+obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o
+obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o
+endif
obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o
obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 697b70ad1195..c8da352e8686 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -34,7 +34,7 @@
* to handle fortunately.
*/
int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
- unsigned long dsisr, unsigned *flt)
+ unsigned long dsisr, vm_fault_t *flt)
{
struct vm_area_struct *vma;
unsigned long is_write;
@@ -112,7 +112,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
return 1;
psize = get_slice_psize(mm, ea);
ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
vsidkey = SLB_VSID_USER;
break;
case VMALLOC_REGION_ID:
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 382528475433..b6e7b5952ab5 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -228,7 +228,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t
do {
SetPageReserved(page);
map_kernel_page(vaddr, page_to_phys(page),
- pgprot_val(pgprot_noncached(PAGE_KERNEL)));
+ pgprot_noncached(PAGE_KERNEL));
page++;
vaddr += PAGE_SIZE;
} while (size -= PAGE_SIZE);
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index 14cfb11b09d0..869294695048 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -19,7 +19,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
-#include <asm/fixmap.h>
#include <asm/pgtable.h>
#include <linux/const.h>
#include <asm/page.h>
@@ -260,7 +259,7 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *
/* to check in the secondary hash table, we invert the hash */
if (!primary)
hash = ~hash;
- hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* see if we can find an entry in the hpte with this hash */
for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/dump_linuxpagetables-8xx.c
new file mode 100644
index 000000000000..ab9e3f24db2f
--- /dev/null
+++ b/arch/powerpc/mm/dump_linuxpagetables-8xx.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <asm/pgtable.h>
+
+#include "dump_linuxpagetables.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_SH,
+ .val = 0,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = 0,
+ .set = "rw",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = _PAGE_RO,
+ .set = "r ",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = _PAGE_NA,
+ .set = " ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "present",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "guarded",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "no cache",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct pgtable_level pg_level[5] = {
+ {
+ }, { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c
new file mode 100644
index 000000000000..ed6fcf78256e
--- /dev/null
+++ b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <asm/pgtable.h>
+
+#include "dump_linuxpagetables.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_PRIVILEGED,
+ .val = 0,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_READ,
+ .val = _PAGE_READ,
+ .set = "r",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_WRITE,
+ .val = _PAGE_WRITE,
+ .set = "w",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PTE,
+ .val = _PAGE_PTE,
+ .set = "pte",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "valid",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT | _PAGE_INVALID,
+ .val = 0,
+ .set = " ",
+ .clear = "present",
+ }, {
+ .mask = H_PAGE_HASHPTE,
+ .val = H_PAGE_HASHPTE,
+ .set = "hpte",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NON_IDEMPOTENT,
+ .val = _PAGE_NON_IDEMPOTENT,
+ .set = "non-idempotent",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_TOLERANT,
+ .val = _PAGE_TOLERANT,
+ .set = "tolerant",
+ .clear = " ",
+ }, {
+ .mask = H_PAGE_BUSY,
+ .val = H_PAGE_BUSY,
+ .set = "busy",
+ }, {
+#ifdef CONFIG_PPC_64K_PAGES
+ .mask = H_PAGE_COMBO,
+ .val = H_PAGE_COMBO,
+ .set = "combo",
+ }, {
+ .mask = H_PAGE_4K_PFN,
+ .val = H_PAGE_4K_PFN,
+ .set = "4K_pfn",
+ }, {
+#else /* CONFIG_PPC_64K_PAGES */
+ .mask = H_PAGE_F_GIX,
+ .val = H_PAGE_F_GIX,
+ .set = "f_gix",
+ .is_val = true,
+ .shift = H_PAGE_F_GIX_SHIFT,
+ }, {
+ .mask = H_PAGE_F_SECOND,
+ .val = H_PAGE_F_SECOND,
+ .set = "f_second",
+ }, {
+#endif /* CONFIG_PPC_64K_PAGES */
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct pgtable_level pg_level[5] = {
+ {
+ }, { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/dump_linuxpagetables-generic.c
new file mode 100644
index 000000000000..1e3829ec1348
--- /dev/null
+++ b/arch/powerpc/mm/dump_linuxpagetables-generic.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <asm/pgtable.h>
+
+#include "dump_linuxpagetables.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_USER,
+ .val = _PAGE_USER,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_RW,
+ .val = _PAGE_RW,
+ .set = "rw",
+ .clear = "r ",
+ }, {
+#ifndef CONFIG_PPC_BOOK3S_32
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+#endif
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "present",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "guarded",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_WRITETHRU,
+ .val = _PAGE_WRITETHRU,
+ .set = "write through",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "no cache",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct pgtable_level pg_level[5] = {
+ {
+ }, { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 876e2a3c79f2..2b74f8adf4d0 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -27,6 +27,8 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
+#include "dump_linuxpagetables.h"
+
#ifdef CONFIG_PPC32
#define KERN_VIRT_START 0
#endif
@@ -101,159 +103,6 @@ static struct addr_marker address_markers[] = {
{ -1, NULL },
};
-struct flag_info {
- u64 mask;
- u64 val;
- const char *set;
- const char *clear;
- bool is_val;
- int shift;
-};
-
-static const struct flag_info flag_array[] = {
- {
- .mask = _PAGE_USER | _PAGE_PRIVILEGED,
- .val = _PAGE_USER,
- .set = "user",
- .clear = " ",
- }, {
- .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
- .val = _PAGE_RW,
- .set = "rw",
- }, {
- .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
- .val = _PAGE_RO,
- .set = "ro",
- }, {
-#if _PAGE_NA != 0
- .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
- .val = _PAGE_RO,
- .set = "na",
- }, {
-#endif
- .mask = _PAGE_EXEC,
- .val = _PAGE_EXEC,
- .set = " X ",
- .clear = " ",
- }, {
- .mask = _PAGE_PTE,
- .val = _PAGE_PTE,
- .set = "pte",
- .clear = " ",
- }, {
- .mask = _PAGE_PRESENT,
- .val = _PAGE_PRESENT,
- .set = "present",
- .clear = " ",
- }, {
-#ifdef CONFIG_PPC_BOOK3S_64
- .mask = H_PAGE_HASHPTE,
- .val = H_PAGE_HASHPTE,
-#else
- .mask = _PAGE_HASHPTE,
- .val = _PAGE_HASHPTE,
-#endif
- .set = "hpte",
- .clear = " ",
- }, {
-#ifndef CONFIG_PPC_BOOK3S_64
- .mask = _PAGE_GUARDED,
- .val = _PAGE_GUARDED,
- .set = "guarded",
- .clear = " ",
- }, {
-#endif
- .mask = _PAGE_DIRTY,
- .val = _PAGE_DIRTY,
- .set = "dirty",
- .clear = " ",
- }, {
- .mask = _PAGE_ACCESSED,
- .val = _PAGE_ACCESSED,
- .set = "accessed",
- .clear = " ",
- }, {
-#ifndef CONFIG_PPC_BOOK3S_64
- .mask = _PAGE_WRITETHRU,
- .val = _PAGE_WRITETHRU,
- .set = "write through",
- .clear = " ",
- }, {
-#endif
-#ifndef CONFIG_PPC_BOOK3S_64
- .mask = _PAGE_NO_CACHE,
- .val = _PAGE_NO_CACHE,
- .set = "no cache",
- .clear = " ",
- }, {
-#else
- .mask = _PAGE_NON_IDEMPOTENT,
- .val = _PAGE_NON_IDEMPOTENT,
- .set = "non-idempotent",
- .clear = " ",
- }, {
- .mask = _PAGE_TOLERANT,
- .val = _PAGE_TOLERANT,
- .set = "tolerant",
- .clear = " ",
- }, {
-#endif
-#ifdef CONFIG_PPC_BOOK3S_64
- .mask = H_PAGE_BUSY,
- .val = H_PAGE_BUSY,
- .set = "busy",
- }, {
-#ifdef CONFIG_PPC_64K_PAGES
- .mask = H_PAGE_COMBO,
- .val = H_PAGE_COMBO,
- .set = "combo",
- }, {
- .mask = H_PAGE_4K_PFN,
- .val = H_PAGE_4K_PFN,
- .set = "4K_pfn",
- }, {
-#else /* CONFIG_PPC_64K_PAGES */
- .mask = H_PAGE_F_GIX,
- .val = H_PAGE_F_GIX,
- .set = "f_gix",
- .is_val = true,
- .shift = H_PAGE_F_GIX_SHIFT,
- }, {
- .mask = H_PAGE_F_SECOND,
- .val = H_PAGE_F_SECOND,
- .set = "f_second",
- }, {
-#endif /* CONFIG_PPC_64K_PAGES */
-#endif
- .mask = _PAGE_SPECIAL,
- .val = _PAGE_SPECIAL,
- .set = "special",
- }
-};
-
-struct pgtable_level {
- const struct flag_info *flag;
- size_t num;
- u64 mask;
-};
-
-static struct pgtable_level pg_level[] = {
- {
- }, { /* pgd */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- }, { /* pud */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- }, { /* pmd */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- }, { /* pte */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- },
-};
-
static void dump_flag_info(struct pg_state *st, const struct flag_info
*flag, u64 pte, int num)
{
@@ -418,12 +267,13 @@ static void walk_pagetables(struct pg_state *st)
unsigned int i;
unsigned long addr;
+ addr = st->start_address;
+
/*
* Traverse the linux pagetable structure and dump pages that are in
* the hash pagetable.
*/
- for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
- addr = KERN_VIRT_START + i * PGDIR_SIZE;
+ for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
if (!pgd_none(*pgd) && !pgd_huge(*pgd))
/* pgd exists */
walk_pud(st, pgd, addr);
@@ -472,9 +322,14 @@ static int ptdump_show(struct seq_file *m, void *v)
{
struct pg_state st = {
.seq = m,
- .start_address = KERN_VIRT_START,
.marker = address_markers,
};
+
+ if (radix_enabled())
+ st.start_address = PAGE_OFFSET;
+ else
+ st.start_address = KERN_VIRT_START;
+
/* Traverse kernel page tables */
walk_pagetables(&st);
note_page(&st, 0, 0, 0);
diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/dump_linuxpagetables.h
new file mode 100644
index 000000000000..5d513636de73
--- /dev/null
+++ b/arch/powerpc/mm/dump_linuxpagetables.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+
+struct flag_info {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+ bool is_val;
+ int shift;
+};
+
+struct pgtable_level {
+ const struct flag_info *flag;
+ size_t num;
+ u64 mask;
+};
+
+extern struct pgtable_level pg_level[5];
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 866446cf2d9a..1697e903bbf2 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -22,6 +22,7 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
@@ -41,7 +42,6 @@
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
@@ -66,37 +66,33 @@ static inline bool notify_page_fault(struct pt_regs *regs)
}
/*
- * Check whether the instruction at regs->nip is a store using
+ * Check whether the instruction inst is a store using
* an update addressing form which will update r1.
*/
-static bool store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(unsigned int inst)
{
- unsigned int inst;
-
- if (get_user(inst, (unsigned int __user *)regs->nip))
- return false;
/* check for 1 in the rA field */
if (((inst >> 16) & 0x1f) != 1)
return false;
/* check major opcode */
switch (inst >> 26) {
- case 37: /* stwu */
- case 39: /* stbu */
- case 45: /* sthu */
- case 53: /* stfsu */
- case 55: /* stfdu */
+ case OP_STWU:
+ case OP_STBU:
+ case OP_STHU:
+ case OP_STFSU:
+ case OP_STFDU:
return true;
- case 62: /* std or stdu */
+ case OP_STD: /* std or stdu */
return (inst & 3) == 1;
- case 31:
+ case OP_31:
/* check minor opcode */
switch ((inst >> 1) & 0x3ff) {
- case 181: /* stdux */
- case 183: /* stwux */
- case 247: /* stbux */
- case 439: /* sthux */
- case 695: /* stfsux */
- case 759: /* stfdux */
+ case OP_31_XOP_STDUX:
+ case OP_31_XOP_STWUX:
+ case OP_31_XOP_STBUX:
+ case OP_31_XOP_STHUX:
+ case OP_31_XOP_STFSUX:
+ case OP_31_XOP_STFDUX:
return true;
}
}
@@ -107,8 +103,7 @@ static bool store_updates_sp(struct pt_regs *regs)
*/
static int
-__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code,
- int pkey)
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
{
/*
* If we are in kernel mode, bail out with a SEGV, this will
@@ -118,18 +113,17 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code,
if (!user_mode(regs))
return SIGSEGV;
- _exception_pkey(SIGSEGV, regs, si_code, address, pkey);
+ _exception(SIGSEGV, regs, si_code, address);
return 0;
}
static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
{
- return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0);
+ return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}
-static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
- int pkey)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
{
struct mm_struct *mm = current->mm;
@@ -139,57 +133,66 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
*/
up_read(&mm->mmap_sem);
- return __bad_area_nosemaphore(regs, address, si_code, pkey);
+ return __bad_area_nosemaphore(regs, address, si_code);
}
static noinline int bad_area(struct pt_regs *regs, unsigned long address)
{
- return __bad_area(regs, address, SEGV_MAPERR, 0);
+ return __bad_area(regs, address, SEGV_MAPERR);
}
static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
int pkey)
{
- return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey);
+ /*
+ * If we are in kernel mode, bail out with a SEGV, this will
+ * be caught by the assembly which will restore the non-volatile
+ * registers before calling bad_page_fault()
+ */
+ if (!user_mode(regs))
+ return SIGSEGV;
+
+ _exception_pkey(regs, address, pkey);
+
+ return 0;
}
static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
- return __bad_area(regs, address, SEGV_ACCERR, 0);
+ return __bad_area(regs, address, SEGV_ACCERR);
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
- unsigned int fault)
+ vm_fault_t fault)
{
- siginfo_t info;
- unsigned int lsb = 0;
-
if (!user_mode(regs))
return SIGBUS;
current->thread.trap_nr = BUS_ADRERR;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ unsigned int lsb = 0; /* shutup gcc */
+
pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
current->comm, current->pid, address);
- info.si_code = BUS_MCEERR_AR;
+
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb,
+ current);
+ return 0;
}
- if (fault & VM_FAULT_HWPOISON_LARGE)
- lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
- if (fault & VM_FAULT_HWPOISON)
- lsb = PAGE_SHIFT;
#endif
- info.si_addr_lsb = lsb;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
return 0;
}
-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
+ vm_fault_t fault)
{
/*
* Kernel page fault interrupted by SIGKILL. We have no reason to
@@ -234,8 +237,8 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
}
static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
- struct vm_area_struct *vma,
- bool store_update_sp)
+ struct vm_area_struct *vma, unsigned int flags,
+ bool *must_retry)
{
/*
* N.B. The POWER/Open ABI allows programs to access up to
@@ -247,6 +250,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* expand to 1MB without further checks.
*/
if (address + 0x100000 < vma->vm_end) {
+ unsigned int __user *nip = (unsigned int __user *)regs->nip;
/* get user regs even if this fault is in kernel mode */
struct pt_regs *uregs = current->thread.regs;
if (uregs == NULL)
@@ -264,8 +268,22 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 < uregs->gpr[1] && !store_update_sp)
- return true;
+ if (address + 2048 >= uregs->gpr[1])
+ return false;
+
+ if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
+ access_ok(VERIFY_READ, nip, sizeof(*nip))) {
+ unsigned int inst;
+ int res;
+
+ pagefault_disable();
+ res = __get_user_inatomic(inst, nip);
+ pagefault_enable();
+ if (!res)
+ return !store_updates_sp(inst);
+ *must_retry = true;
+ }
+ return true;
}
return false;
}
@@ -297,7 +315,12 @@ static bool access_error(bool is_write, bool is_exec,
if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
return true;
-
+ /*
+ * We should ideally do the vma pkey access check here. But in the
+ * fault path, handle_mm_fault() also does the same check. To avoid
+ * these multiple checks, we skip it here and handle access error due
+ * to pkeys later.
+ */
return false;
}
@@ -397,8 +420,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
int is_exec = TRAP(regs) == 0x400;
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
- int fault, major = 0;
- bool store_update_sp = false;
+ vm_fault_t fault, major = 0;
+ bool must_retry = false;
if (notify_page_fault(regs))
return 0;
@@ -449,9 +472,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
* can result in fault, which will cause a deadlock when called with
* mmap_sem held
*/
- if (is_write && is_user)
- store_update_sp = store_updates_sp(regs);
-
if (is_user)
flags |= FAULT_FLAG_USER;
if (is_write)
@@ -498,8 +518,17 @@ retry:
return bad_area(regs, address);
/* The stack is being expanded, check if it's valid */
- if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
- return bad_area(regs, address);
+ if (unlikely(bad_stack_expansion(regs, address, vma, flags,
+ &must_retry))) {
+ if (!must_retry)
+ return bad_area(regs, address);
+
+ up_read(&mm->mmap_sem);
+ if (fault_in_pages_readable((const char __user *)regs->nip,
+ sizeof(unsigned int)))
+ return bad_area_nosemaphore(regs, address);
+ goto retry;
+ }
/* Try to expand it */
if (unlikely(expand_stack(vma, address)))
@@ -518,25 +547,16 @@ good_area:
#ifdef CONFIG_PPC_MEM_KEYS
/*
- * if the HPTE is not hashed, hardware will not detect
- * a key fault. Lets check if we failed because of a
- * software detected key fault.
+ * we skipped checking for access error due to key earlier.
+ * Check that using handle_mm_fault error return.
*/
if (unlikely(fault & VM_FAULT_SIGSEGV) &&
- !arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
- is_exec, 0)) {
- /*
- * The PGD-PDT...PMD-PTE tree may not have been fully setup.
- * Hence we cannot walk the tree to locate the PTE, to locate
- * the key. Hence let's use vma_pkey() to get the key; instead
- * of get_mm_addr_key().
- */
+ !arch_vma_access_permitted(vma, is_write, is_exec, 0)) {
+
int pkey = vma_pkey(vma);
- if (likely(pkey)) {
- up_read(&mm->mmap_sem);
- return bad_key_fault_exception(regs, address, pkey);
- }
+ up_read(&mm->mmap_sem);
+ return bad_key_fault_exception(regs, address, pkey);
}
#endif /* CONFIG_PPC_MEM_KEYS */
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index d573d7d07f25..6fa6765a10eb 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -80,7 +80,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
hash = hpt_hash(vpn, shift, ssize);
repeat:
- hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* Insert into the hash table, primary slot */
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -89,7 +89,7 @@ repeat:
* Primary is full, try the secondary
*/
if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
rflags,
HPTE_V_SECONDARY,
@@ -97,8 +97,8 @@ repeat:
MMU_PAGE_4K, ssize);
if (slot == -1) {
if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
mmu_hash_ops.hpte_remove(hpte_group);
/*
* FIXME!! Should be try the group from which we removed ?
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index e601d95c3b20..3afa253d7f52 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -154,7 +154,7 @@ htab_insert_hpte:
}
hash = hpt_hash(vpn, shift, ssize);
repeat:
- hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* Insert into the hash table, primary slot */
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -165,7 +165,7 @@ repeat:
if (unlikely(slot == -1)) {
bool soft_invalid;
- hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
rflags, HPTE_V_SECONDARY,
MMU_PAGE_4K, MMU_PAGE_4K,
@@ -193,8 +193,7 @@ repeat:
* that we do not get the same soft-invalid slot.
*/
if (soft_invalid || (mftb() & 0x1))
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
mmu_hash_ops.hpte_remove(hpte_group);
/*
@@ -288,7 +287,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
hash = hpt_hash(vpn, shift, ssize);
repeat:
- hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* Insert into the hash table, primary slot */
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -298,7 +297,7 @@ repeat:
* Primary is full, try the secondary
*/
if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
rflags,
HPTE_V_SECONDARY,
@@ -306,8 +305,8 @@ repeat:
MMU_PAGE_64K, ssize);
if (slot == -1) {
if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
mmu_hash_ops.hpte_remove(hpte_group);
/*
* FIXME!! Should be try the group from which we removed ?
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index ffbd7c0bda96..26acf6c8c20c 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -27,6 +27,7 @@
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
+#include <asm/feature-fixups.h>
#ifdef CONFIG_SMP
.section .bss
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 656933c85925..aaa28fd918fe 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -23,13 +23,13 @@
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
#include <asm/trace.h>
#include <asm/tlb.h>
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
#include <asm/ppc-opcode.h>
+#include <asm/feature-fixups.h>
#include <misc/cxl-base.h>
@@ -115,6 +115,8 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
tlbiel_hash_set_isa300(0, is, 0, 2, 1);
asm volatile("ptesync": : :"memory");
+
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
void hash__tlbiel_all(unsigned int action)
@@ -140,8 +142,6 @@ void hash__tlbiel_all(unsigned int action)
tlbiel_all_isa206(POWER7_TLB_SETS, is);
else
WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
-
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
static inline unsigned long ___tlbie(unsigned long vpn, int psize,
@@ -423,9 +423,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
vpn, want_v & HPTE_V_AVPN, slot, newpp);
- hpte_v = be64_to_cpu(hptep->v);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+ hpte_v = hpte_get_old_v(hptep);
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -439,9 +437,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
} else {
native_lock_hpte(hptep);
/* recheck with locks held */
- hpte_v = be64_to_cpu(hptep->v);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+ hpte_v = hpte_get_old_v(hptep);
if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))) {
ret = -1;
@@ -481,11 +477,9 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
/* Bolted mappings are only ever in the primary group */
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
- hptep = htab_address + slot;
- hpte_v = be64_to_cpu(hptep->v);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+ hptep = htab_address + slot;
+ hpte_v = hpte_get_old_v(hptep);
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
return slot;
@@ -574,11 +568,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
- native_lock_hpte(hptep);
- hpte_v = be64_to_cpu(hptep->v);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ native_lock_hpte(hptep);
+ /* recheck with locks held */
+ hpte_v = hpte_get_old_v(hptep);
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->v = 0;
+ else
+ native_unlock_hpte(hptep);
+ }
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -586,13 +588,6 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
* (hpte_remove) because we assume the old translation is still
* technically "valid".
*/
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
- native_unlock_hpte(hptep);
- else
- /* Invalidate the hpte. NOTE: this also unlocks it */
- hptep->v = 0;
-
- /* Invalidate the TLB */
tlbie(vpn, bpsize, apsize, ssize, local);
local_irq_restore(flags);
@@ -634,17 +629,23 @@ static void native_hugepage_invalidate(unsigned long vsid,
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
- native_lock_hpte(hptep);
- hpte_v = be64_to_cpu(hptep->v);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
+ hpte_v = hpte_get_old_v(hptep);
/* Even if we miss, we need to invalidate the TLB */
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
- native_unlock_hpte(hptep);
- else
- /* Invalidate the hpte. NOTE: this also unlocks it */
- hptep->v = 0;
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /* recheck with locks held */
+ native_lock_hpte(hptep);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /*
+ * Invalidate the hpte. NOTE: this also unlocks it
+ */
+
+ hptep->v = 0;
+ } else
+ native_unlock_hpte(hptep);
+ }
/*
* We need to do tlb invalidate for all the address, tlbie
* instruction compares entry_VA in tlb with the VA specified
@@ -812,16 +813,19 @@ static void native_flush_hash_range(unsigned long number, int local)
slot += hidx & _PTEIDX_GROUP_IX;
hptep = htab_address + slot;
want_v = hpte_encode_avpn(vpn, psize, ssize);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ continue;
+ /* lock and try again */
native_lock_hpte(hptep);
- hpte_v = be64_to_cpu(hptep->v);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- hpte_v = hpte_new_to_old_v(hpte_v,
- be64_to_cpu(hptep->r));
- if (!HPTE_V_COMPARE(hpte_v, want_v) ||
- !(hpte_v & HPTE_V_VALID))
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
else
hptep->v = 0;
+
} pte_iterate_hashed_end();
}
@@ -866,18 +870,6 @@ static void native_flush_hash_range(unsigned long number, int local)
local_irq_restore(flags);
}
-static int native_register_proc_table(unsigned long base, unsigned long page_size,
- unsigned long table_size)
-{
- unsigned long patb1 = base << 25; /* VSID */
-
- patb1 |= (page_size << 5); /* sllp */
- patb1 |= table_size;
-
- partition_tb->patb1 = cpu_to_be64(patb1);
- return 0;
-}
-
void __init hpte_init_native(void)
{
mmu_hash_ops.hpte_invalidate = native_hpte_invalidate;
@@ -889,7 +881,4 @@ void __init hpte_init_native(void)
mmu_hash_ops.hpte_clear_all = native_hpte_clear;
mmu_hash_ops.flush_hash_range = native_flush_hash_range;
mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate;
-
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- register_process_table = native_register_proc_table;
}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index cf290d415dcd..0cc7fbc3bd1c 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -48,7 +48,6 @@
#include <linux/uaccess.h>
#include <asm/machdep.h>
#include <asm/prom.h>
-#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/eeh.h>
#include <asm/tlb.h>
@@ -64,6 +63,7 @@
#include <asm/trace.h>
#include <asm/ps3.h>
#include <asm/pte-walk.h>
+#include <asm/asm-prototypes.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -132,9 +132,10 @@ EXPORT_SYMBOL(mmu_hash_ops);
* is provided by the firmware.
*/
-/* Pre-POWER4 CPUs (4k pages only)
+/*
+ * Fallback (4k pages only)
*/
-static struct mmu_psize_def mmu_psize_defaults_old[] = {
+static struct mmu_psize_def mmu_psize_defaults[] = {
[MMU_PAGE_4K] = {
.shift = 12,
.sllp = 0,
@@ -554,8 +555,8 @@ static void __init htab_scan_page_sizes(void)
mmu_psize_set_default_penc();
/* Default to 4K pages only */
- memcpy(mmu_psize_defs, mmu_psize_defaults_old,
- sizeof(mmu_psize_defaults_old));
+ memcpy(mmu_psize_defs, mmu_psize_defaults,
+ sizeof(mmu_psize_defaults));
/*
* Try to find the available page sizes in the device-tree
@@ -571,8 +572,10 @@ static void __init htab_scan_page_sizes(void)
}
#ifdef CONFIG_HUGETLB_PAGE
- /* Reserve 16G huge page memory sections for huge pages */
- of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+ if (!hugetlb_disabled) {
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+ }
#endif /* CONFIG_HUGETLB_PAGE */
}
@@ -781,7 +784,7 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size)
}
}
-int hash__create_section_mapping(unsigned long start, unsigned long end)
+int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
int rc = htab_bolt_mapping(start, end, __pa(start),
pgprot_val(PAGE_KERNEL), mmu_linear_psize,
@@ -804,31 +807,6 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end)
}
#endif /* CONFIG_MEMORY_HOTPLUG */
-static void update_hid_for_hash(void)
-{
- unsigned long hid0;
- unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */
-
- asm volatile("ptesync": : :"memory");
- /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory");
- asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
- trace_tlbie(0, 0, rb, 0, 2, 0, 0);
-
- /*
- * now switch the HID
- */
- hid0 = mfspr(SPRN_HID0);
- hid0 &= ~HID0_POWER9_RADIX;
- mtspr(SPRN_HID0, hid0);
- asm volatile("isync": : :"memory");
-
- /* Wait for it to happen */
- while ((mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
- cpu_relax();
-}
-
static void __init hash_init_partition_table(phys_addr_t hash_table,
unsigned long htab_size)
{
@@ -841,8 +819,6 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
htab_size = __ilog2(htab_size) - 18;
mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
pr_info("Partition table %p\n", partition_tb);
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- update_hid_for_hash();
}
static void __init htab_initialize(void)
@@ -875,6 +851,12 @@ static void __init htab_initialize(void)
/* Using a hypervisor which owns the htab */
htab_address = NULL;
_SDR1 = 0;
+ /*
+ * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
+ * to inform the hypervisor that we wish to use the HPT.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ register_process_table(0, 0, 0);
#ifdef CONFIG_FA_DUMP
/*
* If firmware assisted dump is active firmware preserves
@@ -1003,13 +985,14 @@ void __init hash__early_init_mmu(void)
*/
__pte_frag_nr = H_PTE_FRAG_NR;
__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = H_PMD_FRAG_NR;
+ __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
__pte_index_size = H_PTE_INDEX_SIZE;
__pmd_index_size = H_PMD_INDEX_SIZE;
__pud_index_size = H_PUD_INDEX_SIZE;
__pgd_index_size = H_PGD_INDEX_SIZE;
__pud_cache_index = H_PUD_CACHE_INDEX;
- __pmd_cache_index = H_PMD_CACHE_INDEX;
__pte_table_size = H_PTE_TABLE_SIZE;
__pmd_table_size = H_PMD_TABLE_SIZE;
__pud_table_size = H_PUD_TABLE_SIZE;
@@ -1018,9 +1001,9 @@ void __init hash__early_init_mmu(void)
* 4k use hugepd format, so for hash set then to
* zero
*/
- __pmd_val_bits = 0;
- __pud_val_bits = 0;
- __pgd_val_bits = 0;
+ __pmd_val_bits = HASH_PMD_VAL_BITS;
+ __pud_val_bits = HASH_PUD_VAL_BITS;
+ __pgd_val_bits = HASH_PGD_VAL_BITS;
__kernel_virt_start = H_KERN_VIRT_START;
__kernel_virt_size = H_KERN_VIRT_SIZE;
@@ -1066,9 +1049,6 @@ void hash__early_init_mmu_secondary(void)
/* Initialize hash table for that CPU */
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- update_hid_for_hash();
-
if (!cpu_has_feature(CPU_FTR_ARCH_300))
mtspr(SPRN_SDR1, _SDR1);
else
@@ -1110,19 +1090,18 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
#ifdef CONFIG_PPC_MM_SLICES
static unsigned int get_paca_psize(unsigned long addr)
{
- u64 lpsizes;
- unsigned char *hpsizes;
+ unsigned char *psizes;
unsigned long index, mask_index;
if (addr < SLICE_LOW_TOP) {
- lpsizes = get_paca()->mm_ctx_low_slices_psize;
+ psizes = get_paca()->mm_ctx_low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
- return (lpsizes >> (index * 4)) & 0xF;
+ } else {
+ psizes = get_paca()->mm_ctx_high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
}
- hpsizes = get_paca()->mm_ctx_high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
mask_index = index & 0x1;
- return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF;
+ return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
}
#else
@@ -1146,7 +1125,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
copy_mm_to_paca(mm);
- slb_flush_and_rebolt();
+ slb_flush_and_restore_bolted();
}
}
#endif /* CONFIG_PPC_64K_PAGES */
@@ -1218,7 +1197,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
if (user_region) {
if (psize != get_paca_psize(ea)) {
copy_mm_to_paca(mm);
- slb_flush_and_rebolt();
+ slb_flush_and_restore_bolted();
}
} else if (get_paca()->vmalloc_sllp !=
mmu_psize_defs[mmu_vmalloc_psize].sllp) {
@@ -1262,7 +1241,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
}
psize = get_slice_psize(mm, ea);
ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
break;
case VMALLOC_REGION_ID:
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
@@ -1503,7 +1482,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
#endif
void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap)
+ bool is_exec, unsigned long trap)
{
int hugepage_shift;
unsigned long vsid;
@@ -1511,6 +1490,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
pte_t *ptep;
unsigned long flags;
int rc, ssize, update_flags = 0;
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
BUG_ON(REGION_ID(ea) != USER_REGION_ID);
@@ -1527,7 +1507,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
/* Get VSID */
ssize = user_segment_size(ea);
- vsid = get_vsid(mm->context.id, ea, ssize);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
/*
@@ -1773,8 +1753,7 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
long slot;
repeat:
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* Insert into the hash table, primary slot */
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
@@ -1782,15 +1761,14 @@ repeat:
/* Primary is full, try the secondary */
if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
vflags | HPTE_V_SECONDARY,
psize, psize, ssize);
if (slot == -1) {
if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP)&~0x7UL;
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
mmu_hash_ops.hpte_remove(hpte_group);
goto repeat;
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index 668e87d03f9e..82a0e37557a5 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -56,7 +56,7 @@ EXPORT_SYMBOL(kmap_atomic_prot);
void __kunmap_atomic(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- int type;
+ int type __maybe_unused;
if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
pagefault_enable();
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index f20d16f849c5..dfbc3b32f09b 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -51,6 +51,12 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
new_pmd |= _PAGE_DIRTY;
} while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+ /*
+ * Make sure this is thp or devmap entry
+ */
+ if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+ return 0;
+
rflags = htab_convert_pte_flags(new_pmd);
#if 0
@@ -128,7 +134,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
new_pmd |= H_PAGE_HASHPTE;
repeat:
- hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
/* Insert into the hash table, primary slot */
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
@@ -137,16 +143,15 @@ repeat:
* Primary is full, try the secondary
*/
if (unlikely(slot == -1)) {
- hpte_group = ((~hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
rflags,
HPTE_V_SECONDARY,
psize, lpsize, ssize);
if (slot == -1) {
if (mftb() & 0x1)
- hpte_group = ((hash & htab_hash_mask) *
- HPTES_PER_GROUP) & ~0x7UL;
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
mmu_hash_ops.hpte_remove(hpte_group);
goto repeat;
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index b320f5097a06..2e6a8f9345d3 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -62,6 +62,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
new_pte |= _PAGE_DIRTY;
} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+ /* Make sure this is a hugetlb entry */
+ if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+ return 0;
+
rflags = htab_convert_pte_flags(new_pte);
if (unlikely(mmu_psize == MMU_PAGE_16G))
offset = PTRS_PER_PUD;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 876da2bc1796..8cf035e68378 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -15,10 +15,10 @@
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/kmemleak.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
@@ -35,6 +35,8 @@
#define PAGE_SHIFT_16M 24
#define PAGE_SHIFT_16G 34
+bool hugetlb_disabled = false;
+
unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);
@@ -50,7 +52,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address, unsigned pdshift, unsigned pshift)
+ unsigned long address, unsigned int pdshift,
+ unsigned int pshift, spinlock_t *ptl)
{
struct kmem_cache *cachep;
pte_t *new;
@@ -80,8 +83,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
*/
smp_wmb();
- spin_lock(&mm->page_table_lock);
-
+ spin_lock(ptl);
/*
* We have multiple higher-level entries that point to the same
* actual pte location. Fill in each as we go and backtrack on error.
@@ -93,7 +95,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
break;
else {
#ifdef CONFIG_PPC_BOOK3S_64
- *hpdp = __hugepd(__pa(new) |
+ *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS |
(shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
*hpdp = __hugepd(__pa(new) | _PMD_USER |
@@ -110,24 +112,14 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
for (i = i - 1 ; i >= 0; i--, hpdp--)
*hpdp = __hugepd(0);
kmem_cache_free(cachep, new);
+ } else {
+ kmemleak_ignore(new);
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
/*
- * These macros define how to determine which level of the page table holds
- * the hpdp.
- */
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
-#define HUGEPD_PUD_SHIFT PUD_SHIFT
-#else
-#define HUGEPD_PGD_SHIFT PUD_SHIFT
-#define HUGEPD_PUD_SHIFT PMD_SHIFT
-#endif
-
-/*
* At this point we do the placement change only for BOOK3S 64. This would
* possibly work on other subarchs.
*/
@@ -139,6 +131,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
hugepd_t *hpdp = NULL;
unsigned pshift = __ffs(sz);
unsigned pdshift = PGDIR_SHIFT;
+ spinlock_t *ptl;
addr &= ~(sz-1);
pg = pgd_offset(mm, addr);
@@ -147,39 +140,46 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
if (pshift == PGDIR_SHIFT)
/* 16GB huge page */
return (pte_t *) pg;
- else if (pshift > PUD_SHIFT)
+ else if (pshift > PUD_SHIFT) {
/*
* We need to use hugepd table
*/
+ ptl = &mm->page_table_lock;
hpdp = (hugepd_t *)pg;
- else {
+ } else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
if (pshift == PUD_SHIFT)
return (pte_t *)pu;
- else if (pshift > PMD_SHIFT)
+ else if (pshift > PMD_SHIFT) {
+ ptl = pud_lockptr(mm, pu);
hpdp = (hugepd_t *)pu;
- else {
+ } else {
pdshift = PMD_SHIFT;
pm = pmd_alloc(mm, pu, addr);
if (pshift == PMD_SHIFT)
/* 16MB hugepage */
return (pte_t *)pm;
- else
+ else {
+ ptl = pmd_lockptr(mm, pm);
hpdp = (hugepd_t *)pm;
+ }
}
}
#else
- if (pshift >= HUGEPD_PGD_SHIFT) {
+ if (pshift >= PGDIR_SHIFT) {
+ ptl = &mm->page_table_lock;
hpdp = (hugepd_t *)pg;
} else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
- if (pshift >= HUGEPD_PUD_SHIFT) {
+ if (pshift >= PUD_SHIFT) {
+ ptl = pud_lockptr(mm, pu);
hpdp = (hugepd_t *)pu;
} else {
pdshift = PMD_SHIFT;
pm = pmd_alloc(mm, pu, addr);
+ ptl = pmd_lockptr(mm, pm);
hpdp = (hugepd_t *)pm;
}
}
@@ -189,7 +189,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
+ pdshift, pshift, ptl))
return NULL;
return hugepte_offset(*hpdp, addr, pdshift);
@@ -329,7 +330,8 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
if (shift >= pdshift)
hugepd_free(tlb, hugepte);
else
- pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+ pgtable_free_tlb(tlb, hugepte,
+ get_hugepd_cache_index(pdshift - shift));
}
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -500,6 +502,10 @@ struct page *follow_huge_pd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
retry:
+ /*
+ * hugepage directory entries are protected by mm->page_table_lock
+ * Use this instead of huge_pte_lockptr
+ */
ptl = &mm->page_table_lock;
spin_lock(ptl);
@@ -553,9 +559,11 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+#ifdef CONFIG_PPC_RADIX_MMU
if (radix_enabled())
return radix__hugetlb_get_unmapped_area(file, addr, len,
pgoff, flags);
+#endif
return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif
@@ -563,15 +571,14 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
- unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
/* With radix we don't use slice, so derive it from vma*/
- if (!radix_enabled())
+ if (!radix_enabled()) {
+ unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
+
return 1UL << mmu_psize_to_shift(psize);
+ }
#endif
- if (!is_vm_hugetlb_page(vma))
- return PAGE_SIZE;
-
- return huge_page_size(hstate_vma(vma));
+ return vma_kernel_pagesize(vma);
}
static inline bool is_power_of_4(unsigned long x)
@@ -607,15 +614,12 @@ static int __init add_huge_page_size(unsigned long long size)
* firmware we only add hugetlb support for page sizes that can be
* supported by linux page table layout.
* For now we have
- * Radix: 2M
+ * Radix: 2M and 1G
* Hash: 16M and 16G
*/
if (radix_enabled()) {
- if (mmu_psize != MMU_PAGE_2M) {
- if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
- (mmu_psize != MMU_PAGE_1G))
- return -EINVAL;
- }
+ if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
+ return -EINVAL;
} else {
if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
return -EINVAL;
@@ -653,6 +657,11 @@ static int __init hugetlbpage_init(void)
{
int psize;
+ if (hugetlb_disabled) {
+ pr_info("HugeTLB support is disabled!\n");
+ return 0;
+ }
+
#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;
@@ -666,15 +675,26 @@ static int __init hugetlbpage_init(void)
shift = mmu_psize_to_shift(psize);
- if (add_huge_page_size(1ULL << shift) < 0)
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (shift > PGDIR_SHIFT)
continue;
-
- if (shift < HUGEPD_PUD_SHIFT)
+ else if (shift > PUD_SHIFT)
+ pdshift = PGDIR_SHIFT;
+ else if (shift > PMD_SHIFT)
+ pdshift = PUD_SHIFT;
+ else
+ pdshift = PMD_SHIFT;
+#else
+ if (shift < PUD_SHIFT)
pdshift = PMD_SHIFT;
- else if (shift < HUGEPD_PGD_SHIFT)
+ else if (shift < PGDIR_SHIFT)
pdshift = PUD_SHIFT;
else
pdshift = PGDIR_SHIFT;
+#endif
+
+ if (add_huge_page_size(1ULL << shift) < 0)
+ continue;
/*
* if we have pdshift and shift value same, we don't
* use pgt cache for hugepd.
@@ -819,8 +839,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
ret_pte = (pte_t *) pmdp;
goto out;
}
-
- if (pmd_huge(pmd)) {
+ /*
+ * pmd_large check below will handle the swap pmd pte
+ * we need to do both the check because they are config
+ * dependent.
+ */
+ if (pmd_huge(pmd) || pmd_large(pmd)) {
ret_pte = (pte_t *) pmdp;
goto out;
} else if (is_hugepd(__hugepd(pmd_val(pmd))))
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 6419b33ca309..3e59e5d64b01 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -88,18 +88,13 @@ void MMU_init(void);
int __map_without_bats;
int __map_without_ltlbs;
-/*
- * This tells the system to allow ioremapping memory marked as reserved.
- */
-int __allow_ioremap_reserved;
-
/* max amount of low RAM to map in */
unsigned long __max_low_memory = MAX_LOW_MEM;
/*
* Check for command-line options that affect what MMU_init will do.
*/
-void __init MMU_setup(void)
+static void __init MMU_setup(void)
{
/* Check for nobats option (used in mapin_ram). */
if (strstr(boot_command_line, "nobats")) {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index fdb424a29f03..7a9886f98b0c 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -68,12 +68,6 @@
#include "mmu_decl.h"
-#ifdef CONFIG_PPC_BOOK3S_64
-#if H_PGTABLE_RANGE > USER_VSID_RANGE
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
phys_addr_t memstart_addr = ~0;
EXPORT_SYMBOL_GPL(memstart_addr);
phys_addr_t kernstart_addr;
@@ -314,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr,
{
}
-/*
- * We do not have access to the sparsemem vmemmap, so we fallback to
- * walking the list of sparsemem blocks which we already maintain for
- * the sake of crashdump. In the long run, we might want to maintain
- * a tree if performance of that linear walk becomes a problem.
- *
- * realmode_pfn_to_page functions can fail due to:
- * 1) As real sparsemem blocks do not lay in RAM continously (they
- * are in virtual address space which is not available in the real mode),
- * the requested page struct can be split between blocks so get_page/put_page
- * may fail.
- * 2) When huge pages are used, the get_page/put_page API will fail
- * in real mode as the linked addresses in the page struct are virtual
- * too.
- */
-struct page *realmode_pfn_to_page(unsigned long pfn)
-{
- struct vmemmap_backing *vmem_back;
- struct page *page;
- unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
- unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
-
- for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
- if (pg_va < vmem_back->virt_addr)
- continue;
-
- /* After vmemmap_list entry free is possible, need check all */
- if ((pg_va + sizeof(struct page)) <=
- (vmem_back->virt_addr + page_size)) {
- page = (struct page *) (vmem_back->phys + pg_va -
- vmem_back->virt_addr);
- return page;
- }
- }
-
- /* Probably that page struct is split between real pages */
- return NULL;
-}
-EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-
-#else
-
-struct page *realmode_pfn_to_page(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
- return page;
-}
-EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#ifdef CONFIG_PPC_BOOK3S_64
@@ -372,7 +317,7 @@ static int __init parse_disable_radix(char *p)
{
bool val;
- if (strlen(p) == 0)
+ if (!p)
val = true;
else if (kstrtobool(p, &val))
return -EINVAL;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index fe8c61149fb8..0a64fffabee1 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -27,12 +27,11 @@
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/suspend.h>
-#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -63,6 +62,7 @@
#endif
unsigned long long memory_limit;
+bool init_mem_is_free;
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
@@ -82,17 +82,7 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr)
int page_is_ram(unsigned long pfn)
{
-#ifndef CONFIG_PPC64 /* XXX for now */
- return pfn < max_pfn;
-#else
- unsigned long paddr = (pfn << PAGE_SHIFT);
- struct memblock_region *reg;
-
- for_each_memblock(memory, reg)
- if (paddr >= reg->base && paddr < (reg->base + reg->size))
- return 1;
- return 0;
-#endif
+ return memblock_is_memory(__pfn_to_phys(pfn));
}
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
@@ -117,7 +107,7 @@ int memory_add_physaddr_to_nid(u64 start)
}
#endif
-int __weak create_section_mapping(unsigned long start, unsigned long end)
+int __weak create_section_mapping(unsigned long start, unsigned long end, int nid)
{
return -ENODEV;
}
@@ -127,7 +117,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
return -ENODEV;
}
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
+int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
bool want_memblock)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -137,18 +127,19 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
resize_hpt_for_hotplug(memblock_phys_mem_size());
start = (unsigned long)__va(start);
- rc = create_section_mapping(start, start + size);
+ rc = create_section_mapping(start, start + size, nid);
if (rc) {
pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
start, start + size, rc);
return -EFAULT;
}
+ flush_inval_dcache_range(start, start + size);
return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -169,6 +160,7 @@ int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
+ flush_inval_dcache_range(start, start + size);
ret = remove_section_mapping(start, start + size);
/* Ensure all vmalloc mappings are flushed in case they also
@@ -212,7 +204,7 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
EXPORT_SYMBOL_GPL(walk_system_ram_range);
#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(void)
+void __init mem_topology_setup(void)
{
max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
min_low_pfn = MEMORY_START >> PAGE_SHIFT;
@@ -223,8 +215,11 @@ void __init initmem_init(void)
/* Place all memblock_regions in the same node and merge contiguous
* memblock_regions
*/
- memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
+}
+void __init initmem_init(void)
+{
/* XXX need to clip this if using highmem? */
sparse_memory_present_with_active_regions(0);
sparse_init();
@@ -313,11 +308,11 @@ void __init paging_init(void)
unsigned long end = __fix_to_virt(FIX_HOLE);
for (; v < end; v += PAGE_SIZE)
- map_kernel_page(v, 0, 0); /* XXX gross */
+ map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */
#endif
#ifdef CONFIG_HIGHMEM
- map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */
+ map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
@@ -353,7 +348,7 @@ void __init mem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
set_max_mapnr(max_pfn);
- free_all_bootmem();
+ memblock_free_all();
#ifdef CONFIG_HIGHMEM
{
@@ -401,6 +396,7 @@ void free_initmem(void)
{
ppc_md.progress = ppc_printk_progress;
mark_initmem_nx();
+ init_mem_is_free = true;
free_initmem_default(POISON_FREE_INITMEM);
}
@@ -512,10 +508,13 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
* We don't need to worry about _PAGE_PRESENT here because we are
* called with either mm->page_table_lock held or ptl lock held
*/
- unsigned long access, trap;
+ unsigned long trap;
+ bool is_exec;
- if (radix_enabled())
+ if (radix_enabled()) {
+ prefetch((void *)address);
return;
+ }
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
@@ -532,16 +531,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
switch (trap) {
case 0x300:
- access = 0UL;
+ is_exec = false;
break;
case 0x400:
- access = _PAGE_EXEC;
+ is_exec = true;
break;
default:
return;
}
- hash_preload(vma->vm_mm, address, access, trap);
+ hash_preload(vma->vm_mm, address, is_exec, trap);
#endif /* CONFIG_PPC_STD_MMU */
#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
&& defined(CONFIG_HUGETLB_PAGE)
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index d503f344e476..b24ce40acd47 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -39,12 +39,12 @@
#define MIN_GAP (128*1024*1024)
#define MAX_GAP (TASK_SIZE/6*5)
-static inline int mmap_is_legacy(void)
+static inline int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
@@ -76,9 +76,10 @@ static inline unsigned long stack_maxrandom_size(void)
return (1<<30);
}
-static inline unsigned long mmap_base(unsigned long rnd)
+static inline unsigned long mmap_base(unsigned long rnd,
+ struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size() + stack_guard_gap;
/* Values close to RLIM_INFINITY can overflow. */
@@ -196,26 +197,28 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
}
static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
- unsigned long random_factor)
+ unsigned long random_factor,
+ struct rlimit *rlim_stack)
{
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = radix__arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown;
}
}
#else
/* dummy */
extern void radix__arch_pick_mmap_layout(struct mm_struct *mm,
- unsigned long random_factor);
+ unsigned long random_factor,
+ struct rlimit *rlim_stack);
#endif
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
@@ -223,16 +226,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
random_factor = arch_mmap_rnd();
if (radix_enabled())
- return radix__arch_pick_mmap_layout(mm, random_factor);
+ return radix__arch_pick_mmap_layout(mm, random_factor,
+ rlim_stack);
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0ab297c4cfad..f84e14f23e50 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -57,8 +57,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* in switch_slb(), and/or the store of paca->mm_ctx_id in
* copy_mm_to_paca().
*
- * On the read side the barrier is in pte_xchg(), which orders
- * the store to the PTE vs the load of mm_cpumask.
+ * On the other side, the barrier is in mm/tlb-radix.c for
+ * radix which orders earlier stores to clear the PTEs vs
+ * the load of mm_cpumask. And pte_xchg which does the same
+ * thing for hash.
*
* This full barrier is needed by membarrier when switching
* between processes after store to rq->curr, before user-space
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 3f980baade4c..510f103d7813 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -26,48 +26,16 @@
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
-static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
static int alloc_context_id(int min_id, int max_id)
{
- int index, err;
-
-again:
- if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(&mmu_context_lock);
- err = ida_get_new_above(&mmu_context_ida, min_id, &index);
- spin_unlock(&mmu_context_lock);
-
- if (err == -EAGAIN)
- goto again;
- else if (err)
- return err;
-
- if (index > max_id) {
- spin_lock(&mmu_context_lock);
- ida_remove(&mmu_context_ida, index);
- spin_unlock(&mmu_context_lock);
- return -ENOMEM;
- }
-
- return index;
+ return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
}
void hash__reserve_context_id(int id)
{
- int rc, result = 0;
-
- do {
- if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
- break;
-
- spin_lock(&mmu_context_lock);
- rc = ida_get_new_above(&mmu_context_ida, id, &result);
- spin_unlock(&mmu_context_lock);
- } while (rc == -EAGAIN);
+ int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
}
@@ -85,6 +53,8 @@ int hash__alloc_context_id(void)
}
EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+void slb_setup_new_exec(void);
+
static int hash__init_new_context(struct mm_struct *mm)
{
int index;
@@ -94,13 +64,6 @@ static int hash__init_new_context(struct mm_struct *mm)
return index;
/*
- * In the case of exec, use the default limit,
- * otherwise inherit it from the mm we are duplicating.
- */
- if (!mm->context.slb_addr_limit)
- mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
-
- /*
* The old code would re-promote on fork, we don't do that when using
* slices as it could cause problem promoting slices that have been
* forced down to 4K.
@@ -115,7 +78,7 @@ static int hash__init_new_context(struct mm_struct *mm)
* check against 0 is OK.
*/
if (mm->context.id == 0)
- slice_set_user_psize(mm, mmu_virtual_psize);
+ slice_init_new_context_exec(mm);
subpage_prot_init_new_context(mm);
@@ -123,6 +86,13 @@ static int hash__init_new_context(struct mm_struct *mm)
return index;
}
+void hash__setup_new_exec(void)
+{
+ slice_setup_new_exec();
+
+ slb_setup_new_exec();
+}
+
static int radix__init_new_context(struct mm_struct *mm)
{
unsigned long rts_field;
@@ -166,9 +136,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
mm->context.id = index;
-#ifdef CONFIG_PPC_64K_PAGES
mm->context.pte_frag = NULL;
-#endif
+ mm->context.pmd_frag = NULL;
#ifdef CONFIG_SPAPR_TCE_IOMMU
mm_iommu_init(mm);
#endif
@@ -180,39 +149,64 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
void __destroy_context(int context_id)
{
- spin_lock(&mmu_context_lock);
- ida_remove(&mmu_context_ida, context_id);
- spin_unlock(&mmu_context_lock);
+ ida_free(&mmu_context_ida, context_id);
}
EXPORT_SYMBOL_GPL(__destroy_context);
-#ifdef CONFIG_PPC_64K_PAGES
-static void destroy_pagetable_page(struct mm_struct *mm)
+static void destroy_contexts(mm_context_t *ctx)
+{
+ int index, context_id;
+
+ for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
+ context_id = ctx->extended_id[index];
+ if (context_id)
+ ida_free(&mmu_context_ida, context_id);
+ }
+}
+
+static void pte_frag_destroy(void *pte_frag)
{
int count;
- void *pte_frag;
struct page *page;
- pte_frag = mm->context.pte_frag;
- if (!pte_frag)
- return;
-
page = virt_to_page(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
+ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
pgtable_page_dtor(page);
- free_unref_page(page);
+ __free_page(page);
}
}
-#else
-static inline void destroy_pagetable_page(struct mm_struct *mm)
+static void pmd_frag_destroy(void *pmd_frag)
{
+ int count;
+ struct page *page;
+
+ page = virt_to_page(pmd_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
+ pgtable_pmd_page_dtor(page);
+ __free_page(page);
+ }
+}
+
+static void destroy_pagetable_cache(struct mm_struct *mm)
+{
+ void *frag;
+
+ frag = mm->context.pte_frag;
+ if (frag)
+ pte_frag_destroy(frag);
+
+ frag = mm->context.pmd_frag;
+ if (frag)
+ pmd_frag_destroy(frag);
return;
}
-#endif
void destroy_context(struct mm_struct *mm)
{
@@ -223,13 +217,14 @@ void destroy_context(struct mm_struct *mm)
WARN_ON(process_tb[mm->context.id].prtb0 != 0);
else
subpage_prot_free(mm);
- destroy_pagetable_page(mm);
- __destroy_context(mm->context.id);
+ destroy_contexts(&mm->context);
mm->context.id = MMU_NO_CONTEXT;
}
void arch_exit_mmap(struct mm_struct *mm)
{
+ destroy_pagetable_cache(mm);
+
if (radix_enabled()) {
/*
* Radix doesn't have a valid bit in the process table
@@ -252,15 +247,7 @@ void arch_exit_mmap(struct mm_struct *mm)
#ifdef CONFIG_PPC_RADIX_MMU
void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
{
-
- if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
- isync();
- mtspr(SPRN_PID, next->context.id);
- isync();
- asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
- } else {
- mtspr(SPRN_PID, next->context.id);
- isync();
- }
+ mtspr(SPRN_PID, next->context.id);
+ isync();
}
#endif
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c
index aa5a7fd89461..921c1e33e941 100644
--- a/arch/powerpc/mm/mmu_context_hash32.c
+++ b/arch/powerpc/mm/mmu_context_hash32.c
@@ -27,7 +27,6 @@
#include <linux/export.h>
#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
/*
* On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index e0a2d8e806ed..56c2234cc6ae 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -18,15 +18,21 @@
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
+#include <linux/sizes.h>
#include <asm/mmu_context.h>
+#include <asm/pte-walk.h>
static DEFINE_MUTEX(mem_list_mutex);
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
+
struct mm_iommu_table_group_mem_t {
struct list_head next;
struct rcu_head rcu;
unsigned long used;
atomic64_t mapped;
+ unsigned int pageshift;
u64 ua; /* userspace address */
u64 entries; /* number of entries in hpas[] */
u64 *hpas; /* vmalloc'ed */
@@ -75,8 +81,7 @@ EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
/*
* Taken from alloc_migrate_target with changes to remove CMA allocations
*/
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
- int **resultp)
+struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
{
gfp_t gfp_mask = GFP_USER;
struct page *new_page;
@@ -112,7 +117,7 @@ static int mm_iommu_move_page_from_cma(struct page *page)
put_page(page); /* Drop the gup reference */
ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CMA);
+ NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
if (ret) {
if (!list_empty(&cma_migrate_pages))
putback_movable_pages(&cma_migrate_pages);
@@ -126,6 +131,9 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
{
struct mm_iommu_table_group_mem_t *mem;
long i, j, ret = 0, locked_entries = 0;
+ unsigned int pageshift;
+ unsigned long flags;
+ unsigned long cur_ua;
struct page *page = NULL;
mutex_lock(&mem_list_mutex);
@@ -160,7 +168,13 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
goto unlock_exit;
}
- mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
+ /*
+ * For a starting point for a maximum page size calculation
+ * we use @ua and @entries natural alignment to allow IOMMU pages
+ * smaller than huge pages but still bigger than PAGE_SIZE.
+ */
+ mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
+ mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
if (!mem->hpas) {
kfree(mem);
ret = -ENOMEM;
@@ -168,7 +182,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
}
for (i = 0; i < entries; ++i) {
- if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+ cur_ua = ua + (i << PAGE_SHIFT);
+ if (1 != get_user_pages_fast(cur_ua,
1/* pages */, 1/* iswrite */, &page)) {
ret = -EFAULT;
for (j = 0; j < i; ++j)
@@ -187,7 +202,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
if (is_migrate_cma_page(page)) {
if (mm_iommu_move_page_from_cma(page))
goto populate;
- if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+ if (1 != get_user_pages_fast(cur_ua,
1/* pages */, 1/* iswrite */,
&page)) {
ret = -EFAULT;
@@ -200,6 +215,24 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
}
}
populate:
+ pageshift = PAGE_SHIFT;
+ if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
+ pte_t *pte;
+ struct page *head = compound_head(page);
+ unsigned int compshift = compound_order(head);
+ unsigned int pteshift;
+
+ local_irq_save(flags); /* disables as well */
+ pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
+
+ /* Double check it is still the same pinned page */
+ if (pte && pte_page(*pte) == head &&
+ pteshift == compshift + PAGE_SHIFT)
+ pageshift = max_t(unsigned int, pteshift,
+ PAGE_SHIFT);
+ local_irq_restore(flags);
+ }
+ mem->pageshift = min(mem->pageshift, pageshift);
mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
}
@@ -234,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
if (!page)
continue;
+ if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+ SetPageDirty(page);
+
put_page(page);
mem->hpas[i] = 0;
}
@@ -331,7 +367,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
return ret;
}
-EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries)
@@ -350,7 +385,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
EXPORT_SYMBOL_GPL(mm_iommu_find);
long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned long *hpa)
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
{
const long entry = (ua - mem->ua) >> PAGE_SHIFT;
u64 *va = &mem->hpas[entry];
@@ -358,14 +393,17 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
if (entry >= mem->entries)
return -EFAULT;
- *hpa = *va | (ua & ~PAGE_MASK);
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
+ *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
return 0;
}
EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned long *hpa)
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
{
const long entry = (ua - mem->ua) >> PAGE_SHIFT;
void *va = &mem->hpas[entry];
@@ -374,15 +412,38 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
if (entry >= mem->entries)
return -EFAULT;
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
pa = (void *) vmalloc_to_phys(va);
if (!pa)
return -EFAULT;
- *hpa = *pa | (ua & ~PAGE_MASK);
+ *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
return 0;
}
-EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
+
+extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ long entry;
+ void *va;
+ unsigned long *pa;
+
+ mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
+ if (!mem)
+ return;
+
+ entry = (ua - mem->ua) >> PAGE_SHIFT;
+ va = &mem->hpas[entry];
+
+ pa = (void *) vmalloc_to_phys(va);
+ if (!pa)
+ return;
+
+ *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+}
long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
{
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 4554d6527682..2faca46ad720 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -44,7 +44,7 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/spinlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/slab.h>
@@ -54,16 +54,44 @@
#include "mmu_decl.h"
-static unsigned int first_context, last_context;
+/*
+ * The MPC8xx has only 16 contexts. We rotate through them on each task switch.
+ * A better way would be to keep track of tasks that own contexts, and implement
+ * an LRU usage. That way very active tasks don't always have to pay the TLB
+ * reload overhead. The kernel pages are mapped shared, so the kernel can run on
+ * behalf of any task that makes a kernel entry. Shared does not mean they are
+ * not protected, just that the ASID comparison is not performed. -- Dan
+ *
+ * The IBM4xx has 256 contexts, so we can just rotate through these as a way of
+ * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison
+ * is disabled, so we can use a TID of zero to represent all kernel pages as
+ * shared among all contexts. -- Dan
+ *
+ * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should
+ * normally never have to steal though the facility is present if needed.
+ * -- BenH
+ */
+#define FIRST_CONTEXT 1
+#ifdef DEBUG_CLAMP_LAST_CONTEXT
+#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT
+#elif defined(CONFIG_PPC_8xx)
+#define LAST_CONTEXT 16
+#elif defined(CONFIG_PPC_47x)
+#define LAST_CONTEXT 65535
+#else
+#define LAST_CONTEXT 255
+#endif
+
static unsigned int next_context, nr_free_contexts;
static unsigned long *context_map;
+#ifdef CONFIG_SMP
static unsigned long *stale_map[NR_CPUS];
+#endif
static struct mm_struct **context_mm;
static DEFINE_RAW_SPINLOCK(context_lock);
-static bool no_selective_tlbil;
#define CTX_MAP_SIZE \
- (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+ (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1))
/* Steal a context from a task that has one at the moment.
@@ -87,7 +115,7 @@ static unsigned int steal_context_smp(unsigned int id)
struct mm_struct *mm;
unsigned int cpu, max, i;
- max = last_context - first_context;
+ max = LAST_CONTEXT - FIRST_CONTEXT;
/* Attempt to free next_context first and then loop until we manage */
while (max--) {
@@ -99,8 +127,8 @@ static unsigned int steal_context_smp(unsigned int id)
*/
if (mm->context.active) {
id++;
- if (id > last_context)
- id = first_context;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
continue;
}
pr_hardcont(" | steal %d from 0x%p", id, mm);
@@ -139,10 +167,12 @@ static unsigned int steal_context_smp(unsigned int id)
static unsigned int steal_all_contexts(void)
{
struct mm_struct *mm;
+#ifdef CONFIG_SMP
int cpu = smp_processor_id();
+#endif
unsigned int id;
- for (id = first_context; id <= last_context; id++) {
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
/* Pick up the victim mm */
mm = context_mm[id];
@@ -150,22 +180,24 @@ static unsigned int steal_all_contexts(void)
/* Mark this mm as having no context anymore */
mm->context.id = MMU_NO_CONTEXT;
- if (id != first_context) {
+ if (id != FIRST_CONTEXT) {
context_mm[id] = NULL;
__clear_bit(id, context_map);
#ifdef DEBUG_MAP_CONSISTENCY
mm->context.active = 0;
#endif
}
+#ifdef CONFIG_SMP
__clear_bit(id, stale_map[cpu]);
+#endif
}
/* Flush the TLB for all contexts (not to be used on SMP) */
_tlbil_all();
- nr_free_contexts = last_context - first_context;
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT;
- return first_context;
+ return FIRST_CONTEXT;
}
/* Note that this will also be called on SMP if all other CPUs are
@@ -176,7 +208,9 @@ static unsigned int steal_all_contexts(void)
static unsigned int steal_context_up(unsigned int id)
{
struct mm_struct *mm;
+#ifdef CONFIG_SMP
int cpu = smp_processor_id();
+#endif
/* Pick up the victim mm */
mm = context_mm[id];
@@ -190,7 +224,9 @@ static unsigned int steal_context_up(unsigned int id)
mm->context.id = MMU_NO_CONTEXT;
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+#ifdef CONFIG_SMP
__clear_bit(id, stale_map[cpu]);
+#endif
return id;
}
@@ -201,7 +237,7 @@ static void context_check_map(void)
unsigned int id, nrf, nact;
nrf = nact = 0;
- for (id = first_context; id <= last_context; id++) {
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
int used = test_bit(id, context_map);
if (!used)
nrf++;
@@ -219,7 +255,7 @@ static void context_check_map(void)
if (nact > num_online_cpus())
pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
nact, num_online_cpus());
- if (first_context > 0 && !test_bit(0, context_map))
+ if (FIRST_CONTEXT > 0 && !test_bit(0, context_map))
pr_err("MMU: Context 0 has been freed !!!\n");
}
#else
@@ -229,7 +265,10 @@ static void context_check_map(void) { }
void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- unsigned int i, id, cpu = smp_processor_id();
+ unsigned int id;
+#ifdef CONFIG_SMP
+ unsigned int i, cpu = smp_processor_id();
+#endif
unsigned long *map;
/* No lockless fast path .. yet */
@@ -263,8 +302,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
/* We really don't have a context, let's try to acquire one */
id = next_context;
- if (id > last_context)
- id = first_context;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
map = context_map;
/* No more free contexts, let's try to steal one */
@@ -277,7 +316,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
goto stolen;
}
#endif /* CONFIG_SMP */
- if (no_selective_tlbil)
+ if (IS_ENABLED(CONFIG_PPC_8xx))
id = steal_all_contexts();
else
id = steal_context_up(id);
@@ -287,9 +326,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
/* We know there's at least one free context, try to find it */
while (__test_and_set_bit(id, map)) {
- id = find_next_zero_bit(map, last_context+1, id);
- if (id > last_context)
- id = first_context;
+ id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
}
stolen:
next_context = id + 1;
@@ -303,6 +342,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
/* If that context got marked stale on this CPU, then flush the
* local TLB for it and unmark it before we use it
*/
+#ifdef CONFIG_SMP
if (test_bit(id, stale_map[cpu])) {
pr_hardcont(" | stale flush %d [%d..%d]",
id, cpu_first_thread_sibling(cpu),
@@ -317,6 +357,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
__clear_bit(id, stale_map[i]);
}
}
+#endif
/* Flick the MMU and release lock */
pr_hardcont(" -> %d\n", id);
@@ -331,6 +372,17 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
{
pr_hard("initing context for mm @%p\n", mm);
+#ifdef CONFIG_PPC_MM_SLICES
+ /*
+ * We have MMU_NO_CONTEXT set to be ~0. Hence check
+ * explicitly against context.id == 0. This ensures that we properly
+ * initialize context slice details for newly allocated mm's (which will
+ * have id == 0) and don't alter context slice inherited via fork (which
+ * will have id != 0).
+ */
+ if (mm->context.id == 0)
+ slice_init_new_context_exec(mm);
+#endif
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
return 0;
@@ -407,52 +459,13 @@ void __init mmu_context_init(void)
init_mm.context.active = NR_CPUS;
/*
- * The MPC8xx has only 16 contexts. We rotate through them on each
- * task switch. A better way would be to keep track of tasks that
- * own contexts, and implement an LRU usage. That way very active
- * tasks don't always have to pay the TLB reload overhead. The
- * kernel pages are mapped shared, so the kernel can run on behalf
- * of any task that makes a kernel entry. Shared does not mean they
- * are not protected, just that the ASID comparison is not performed.
- * -- Dan
- *
- * The IBM4xx has 256 contexts, so we can just rotate through these
- * as a way of "switching" contexts. If the TID of the TLB is zero,
- * the PID/TID comparison is disabled, so we can use a TID of zero
- * to represent all kernel pages as shared among all contexts.
- * -- Dan
- *
- * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We
- * should normally never have to steal though the facility is
- * present if needed.
- * -- BenH
- */
- if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
- first_context = 0;
- last_context = 15;
- no_selective_tlbil = true;
- } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
- first_context = 1;
- last_context = 65535;
- no_selective_tlbil = false;
- } else {
- first_context = 1;
- last_context = 255;
- no_selective_tlbil = false;
- }
-
-#ifdef DEBUG_CLAMP_LAST_CONTEXT
- last_context = DEBUG_CLAMP_LAST_CONTEXT;
-#endif
- /*
* Allocate the maps used by context management
*/
- context_map = memblock_virt_alloc(CTX_MAP_SIZE, 0);
- context_mm = memblock_virt_alloc(sizeof(void *) * (last_context + 1), 0);
-#ifndef CONFIG_SMP
- stale_map[0] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
-#else
- stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
+ context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
+ SMP_CACHE_BYTES);
+#ifdef CONFIG_SMP
+ stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
"powerpc/mmu/ctx:prepare",
@@ -461,17 +474,17 @@ void __init mmu_context_init(void)
printk(KERN_INFO
"MMU: Allocated %zu bytes of context maps for %d contexts\n",
- 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
- last_context - first_context + 1);
+ 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)),
+ LAST_CONTEXT - FIRST_CONTEXT + 1);
/*
* Some processors have too few contexts to reserve one for
* init_mm, and require using context 0 for a normal task.
* Other processors reserve the use of context zero for the kernel.
- * This code assumes first_context < 32.
+ * This code assumes FIRST_CONTEXT < 32.
*/
- context_map[0] = (1 << first_context) - 1;
- next_context = first_context;
- nr_free_contexts = last_context - first_context + 1;
+ context_map[0] = (1 << FIRST_CONTEXT) - 1;
+ next_context = FIRST_CONTEXT;
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 57fbc554c785..8574fbbc45e0 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -19,10 +19,10 @@
*
*/
#include <linux/mm.h>
-#include <asm/tlbflush.h>
#include <asm/mmu.h>
#ifdef CONFIG_PPC_MMU_NOHASH
+#include <asm/trace.h>
/*
* On 40x and 8xx, we directly inline tlbia and tlbivax
@@ -31,10 +31,12 @@
static inline void _tlbil_all(void)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
+ trace_tlbia(MMU_NO_CONTEXT);
}
static inline void _tlbil_pid(unsigned int pid)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
+ trace_tlbia(pid);
}
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
@@ -56,6 +58,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid,
unsigned int tsize, unsigned int ind)
{
asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
+ trace_tlbie(0, 0, address, pid, 0, 0, 0);
}
#elif defined(CONFIG_PPC_BOOK3E)
extern void _tlbil_va(unsigned long address, unsigned int pid,
@@ -83,7 +86,7 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
#else /* CONFIG_PPC_MMU_NOHASH */
extern void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap);
+ bool is_exec, unsigned long trap);
extern void _tlbie(unsigned long address);
@@ -98,7 +101,6 @@ extern void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, pgprot_t prot);
extern int __map_without_bats;
-extern int __allow_ioremap_reserved;
extern unsigned int rtas_data, rtas_size;
struct hash_pte;
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index edd8d0bc9364..ce28ae5ca080 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -11,7 +11,7 @@
#define pr_fmt(fmt) "numa: " fmt
#include <linux/threads.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -19,7 +19,6 @@
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
-#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
@@ -788,7 +787,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
void *nd;
int tnid;
- nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
nd = __va(nd_pa);
/* report and initialize */
@@ -831,18 +830,13 @@ out:
of_node_put(rtas);
}
-void __init initmem_init(void)
+void __init mem_topology_setup(void)
{
- int nid, cpu;
-
- max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
- max_pfn = max_low_pfn;
+ int cpu;
if (parse_numa_properties())
setup_nonnuma();
- memblock_dump_all();
-
/*
* Modify the set of possible NUMA nodes to reflect information
* available about the set of online nodes, and the set of nodes
@@ -853,6 +847,23 @@ void __init initmem_init(void)
find_possible_nodes();
+ setup_node_to_cpumask_map();
+
+ reset_numa_cpu_lookup_table();
+
+ for_each_present_cpu(cpu)
+ numa_setup_cpu(cpu);
+}
+
+void __init initmem_init(void)
+{
+ int nid;
+
+ max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ max_pfn = max_low_pfn;
+
+ memblock_dump_all();
+
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
@@ -863,10 +874,6 @@ void __init initmem_init(void)
sparse_init();
- setup_node_to_cpumask_map();
-
- reset_numa_cpu_lookup_table();
-
/*
* We need the numa_cpu_lookup_table to be accurate for all CPUs,
* even before we online them, so that we can use cpu_to_{node,mem}
@@ -876,8 +883,6 @@ void __init initmem_init(void)
*/
cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
- for_each_present_cpu(cpu)
- numa_setup_cpu(cpu);
}
static int __init early_numa(char *p)
@@ -1072,7 +1077,6 @@ static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;
-static int topology_update_needed;
/*
* Change polling interval for associativity changes.
@@ -1105,7 +1109,7 @@ static void setup_cpu_associativity_change_counters(void)
for_each_possible_cpu(cpu) {
int i;
u8 *counts = vphn_cpu_change_counts[cpu];
- volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+ volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
for (i = 0; i < distance_ref_points_depth; i++)
counts[i] = hypervisor_counts[i];
@@ -1131,7 +1135,7 @@ static int update_cpu_associativity_changes_mask(void)
for_each_possible_cpu(cpu) {
int i, changed = 0;
u8 *counts = vphn_cpu_change_counts[cpu];
- volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+ volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
for (i = 0; i < distance_ref_points_depth; i++) {
if (hypervisor_counts[i] != counts[i]) {
@@ -1174,7 +1178,7 @@ static long vphn_get_associativity(unsigned long cpu,
switch (rc) {
case H_FUNCTION:
- printk(KERN_INFO
+ printk_once(KERN_INFO
"VPHN is not supported. Disabling polling...\n");
stop_topology_update();
break;
@@ -1199,7 +1203,9 @@ int find_and_online_cpu_nid(int cpu)
int new_nid;
/* Use associativity from first thread for all siblings */
- vphn_get_associativity(cpu, associativity);
+ if (vphn_get_associativity(cpu, associativity))
+ return cpu_to_node(cpu);
+
new_nid = associativity_to_nid(associativity);
if (new_nid < 0 || !node_possible(new_nid))
new_nid = first_online_node;
@@ -1210,9 +1216,10 @@ int find_and_online_cpu_nid(int cpu)
* Need to ensure that NODE_DATA is initialized for a node from
* available memory (see memblock_alloc_try_nid). If unable to
* init the node, then default to nearest node that has memory
- * installed.
+ * installed. Skip onlining a node if the subsystems are not
+ * yet initialized.
*/
- if (try_online_node(new_nid))
+ if (!topology_inited || try_online_node(new_nid))
new_nid = first_online_node;
#else
/*
@@ -1300,17 +1307,14 @@ int numa_update_cpu_topology(bool cpus_locked)
struct device *dev;
int weight, new_nid, i = 0;
- if (!prrn_enabled && !vphn_enabled) {
- if (!topology_inited)
- topology_update_needed = 1;
+ if (!prrn_enabled && !vphn_enabled && topology_inited)
return 0;
- }
weight = cpumask_weight(&cpu_associativity_changes_mask);
if (!weight)
return 0;
- updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
+ updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
if (!updates)
return 0;
@@ -1417,7 +1421,6 @@ int numa_update_cpu_topology(bool cpus_locked)
out:
kfree(updates);
- topology_update_needed = 0;
return changed;
}
@@ -1451,7 +1454,8 @@ static struct timer_list topology_timer;
static void reset_topology_timer(void)
{
- mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
+ if (vphn_enabled)
+ mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
}
#ifdef CONFIG_SMP
@@ -1516,6 +1520,10 @@ int start_topology_update(void)
}
}
+ pr_info("Starting topology update%s%s\n",
+ (prrn_enabled ? " prrn_enabled" : ""),
+ (vphn_enabled ? " vphn_enabled" : ""));
+
return rc;
}
@@ -1537,6 +1545,8 @@ int stop_topology_update(void)
rc = del_timer_sync(&topology_timer);
}
+ pr_info("Stopping topology update\n");
+
return rc;
}
@@ -1545,6 +1555,15 @@ int prrn_is_enabled(void)
return prrn_enabled;
}
+void __init shared_proc_topology_init(void)
+{
+ if (lppaca_shared_proc(get_lppaca())) {
+ bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+ nr_cpumask_bits);
+ numa_update_cpu_topology(false);
+ }
+}
+
static int topology_read(struct seq_file *file, void *v)
{
if (vphn_enabled || prrn_enabled)
@@ -1602,10 +1621,6 @@ static int topology_update_init(void)
return -ENOMEM;
topology_inited = 1;
- if (topology_update_needed)
- bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
- nr_cpumask_bits);
-
return 0;
}
device_initcall(topology_update_init);
diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c
index a2298930f990..e0ccf36714b2 100644
--- a/arch/powerpc/mm/pgtable-book3e.c
+++ b/arch/powerpc/mm/pgtable-book3e.c
@@ -42,7 +42,7 @@ int __meminit vmemmap_create_mapping(unsigned long start,
* thus must have the low bits clear
*/
for (i = 0; i < page_size; i += PAGE_SIZE)
- BUG_ON(map_kernel_page(start + i, phys, flags));
+ BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags)));
return 0;
}
@@ -70,7 +70,7 @@ static __ref void *early_alloc_pgtable(unsigned long size)
* map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
-int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
pud_t *pudp;
@@ -89,8 +89,6 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
ptep = pte_alloc_kernel(pmdp, ea);
if (!ptep)
return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
} else {
pgdp = pgd_offset_k(ea);
#ifndef __PAGETABLE_PUD_FOLDED
@@ -113,9 +111,8 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
pmd_populate_kernel(&init_mm, pmdp, ptep);
}
ptep = pte_offset_kernel(pmdp, ea);
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
}
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
smp_wmb();
return 0;
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 422e80253a33..9f93c9f985c5 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -9,14 +9,22 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
+#include <linux/memblock.h>
#include <misc/cxl-base.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/trace.h>
+#include <asm/powernv.h>
#include "mmu_decl.h"
#include <trace/events/thp.h>
+unsigned long __pmd_frag_nr;
+EXPORT_SYMBOL(__pmd_frag_nr);
+unsigned long __pmd_frag_size_shift;
+EXPORT_SYMBOL(__pmd_frag_size_shift);
+
int (*register_process_table)(unsigned long base, unsigned long page_size,
unsigned long tbl_size);
@@ -34,13 +42,16 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
int changed;
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
changed = !pmd_same(*(pmdp), entry);
if (changed) {
- __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
- pmd_pte(entry), address);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * We can use MMU_PAGE_2M here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
+ pmd_pte(entry), address, MMU_PAGE_2M);
}
return changed;
}
@@ -58,9 +69,14 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
- WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
- assert_spin_locked(&mm->page_table_lock);
- WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd)));
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+ WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd)));
#endif
trace_hugepage_set_pmd(addr, pmd_val(pmd));
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
@@ -95,7 +111,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
unsigned long old_pmd;
- old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+ old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
/*
* This ensures that generic code that rely on IRQ disabling
@@ -141,7 +157,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
{
- return;
+ if (radix_enabled())
+ prefetch((void *)addr);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -155,15 +172,15 @@ void mmu_cleanup_all(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int create_section_mapping(unsigned long start, unsigned long end)
+int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
{
if (radix_enabled())
- return radix__create_section_mapping(start, end);
+ return radix__create_section_mapping(start, end, nid);
- return hash__create_section_mapping(start, end);
+ return hash__create_section_mapping(start, end, nid);
}
-int remove_section_mapping(unsigned long start, unsigned long end)
+int __meminit remove_section_mapping(unsigned long start, unsigned long end)
{
if (radix_enabled())
return radix__remove_section_mapping(start, end);
@@ -171,3 +188,297 @@ int remove_section_mapping(unsigned long start, unsigned long end)
return hash__remove_section_mapping(start, end);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void __init mmu_partition_table_init(void)
+{
+ unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+ unsigned long ptcr;
+
+ BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+ partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+ MEMBLOCK_ALLOC_ANYWHERE));
+
+ /* Initialize the Partition Table with no entries */
+ memset((void *)partition_tb, 0, patb_size);
+
+ /*
+ * update partition table control register,
+ * 64 K size.
+ */
+ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+ mtspr(SPRN_PTCR, ptcr);
+ powernv_set_nmmu_ptcr(ptcr);
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+ unsigned long dw1)
+{
+ unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+ partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+ partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+ /*
+ * Global flush of TLBs and partition table caches for this lpid.
+ * The type of flush (hash or radix) depends on what the previous
+ * use of this partition ID was, not the new use.
+ */
+ asm volatile("ptesync" : : : "memory");
+ if (old & PATB_HR) {
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
+ } else {
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+ }
+ /* do we need fixup here ?*/
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+
+static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
+{
+ void *pmd_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pmd_frag;
+ if (ret) {
+ pmd_frag = ret + PMD_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
+ pmd_frag = NULL;
+ mm->context.pmd_frag = pmd_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pmd_t *)ret;
+}
+
+static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
+{
+ void *ret = NULL;
+ struct page *page;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ page = alloc_page(gfp);
+ if (!page)
+ return NULL;
+ if (!pgtable_pmd_page_ctor(page)) {
+ __free_pages(page, 0);
+ return NULL;
+ }
+
+ atomic_set(&page->pt_frag_refcount, 1);
+
+ ret = page_address(page);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PMD_FRAG_NR == 1)
+ return ret;
+
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pmd_frag)) {
+ atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pmd_t *)ret;
+}
+
+pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
+{
+ pmd_t *pmd;
+
+ pmd = get_pmd_from_cache(mm);
+ if (pmd)
+ return pmd;
+
+ return __alloc_for_pmdcache(mm);
+}
+
+void pmd_fragment_free(unsigned long *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+
+ BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+ pgtable_pmd_page_dtor(page);
+ __free_page(page);
+ }
+}
+
+static pte_t *get_pte_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pte_frag;
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ mm->context.pte_frag = pte_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page;
+
+ if (!kernel) {
+ page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
+ if (!page)
+ return NULL;
+ if (!pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+ } else {
+ page = alloc_page(PGALLOC_GFP);
+ if (!page)
+ return NULL;
+ }
+
+ atomic_set(&page->pt_frag_refcount, 1);
+
+ ret = page_address(page);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PTE_FRAG_NR == 1)
+ return ret;
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pte_frag)) {
+ atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR);
+ mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_pte_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_ptecache(mm, kernel);
+}
+
+void pte_fragment_free(unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+
+ BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+ if (!kernel)
+ pgtable_page_dtor(page);
+ __free_page(page);
+ }
+}
+
+static inline void pgtable_free(void *table, int index)
+{
+ switch (index) {
+ case PTE_INDEX:
+ pte_fragment_free(table, 0);
+ break;
+ case PMD_INDEX:
+ pmd_fragment_free(table);
+ break;
+ case PUD_INDEX:
+ kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+ break;
+#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
+ /* 16M hugepd directory at pud level */
+ case HTLB_16M_INDEX:
+ BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
+ kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
+ break;
+ /* 16G hugepd directory at the pgd level */
+ case HTLB_16G_INDEX:
+ BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
+ kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
+ break;
+#endif
+ /* We don't free pgd table via RCU callback */
+ default:
+ BUG();
+ }
+}
+
+#ifdef CONFIG_SMP
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= index;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ return pgtable_free(table, index);
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ return pgtable_free(table, index);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+ /*
+ * Hash maps the memory with one size mmu_linear_psize.
+ * So don't bother to print these on hash
+ */
+ if (!radix_enabled())
+ return;
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
+ seq_printf(m, "DirectMap64k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 469808e77e58..c08d49046a96 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -24,6 +24,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
+#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
* vmemmap is the starting address of the virtual address space where
@@ -138,7 +142,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
* map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
-int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
pud_t *pudp;
@@ -157,8 +161,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag
ptep = pte_alloc_kernel(pmdp, ea);
if (!ptep)
return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
} else {
/*
* If the mm subsystem is not fully up, we cannot create a
@@ -166,7 +169,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag
* entry in the hardware page table.
*
*/
- if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
mmu_io_psize, mmu_kernel_ssize)) {
printk(KERN_ERR "Failed to do bolted mapping IO "
"memory at %016lx !\n", pa);
@@ -189,7 +192,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
#ifdef CONFIG_DEBUG_VM
WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
__asm__ __volatile__(
@@ -261,7 +264,8 @@ void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
/*
* we store the pgtable in the second half of PMD
*/
@@ -281,7 +285,8 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
@@ -320,7 +325,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
- vsid = get_vsid(mm->context.id, addr, ssize);
+ vsid = get_user_vsid(&mm->context, addr, ssize);
WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 2e10a964e290..931156069a81 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -48,20 +48,88 @@ static int native_register_process_table(unsigned long base, unsigned long pg_sz
return 0;
}
-static __ref void *early_alloc_pgtable(unsigned long size)
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+ unsigned long region_start, unsigned long region_end)
{
+ unsigned long pa = 0;
void *pt;
- pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE));
+ if (region_start || region_end) /* has region hint */
+ pa = memblock_alloc_range(size, size, region_start, region_end,
+ MEMBLOCK_NONE);
+ else if (nid != -1) /* has node hint */
+ pa = memblock_alloc_base_nid(size, size,
+ MEMBLOCK_ALLOC_ANYWHERE,
+ nid, MEMBLOCK_NONE);
+
+ if (!pa)
+ pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);
+
+ BUG_ON(!pa);
+
+ pt = __va(pa);
memset(pt, 0, size);
return pt;
}
-int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
pgprot_t flags,
- unsigned int map_page_size)
+ unsigned int map_page_size,
+ int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ unsigned long pfn = pa >> PAGE_SHIFT;
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pgdp = pgd_offset_k(ea);
+ if (pgd_none(*pgdp)) {
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+ region_start, region_end);
+ pgd_populate(&init_mm, pgdp, pudp);
+ }
+ pudp = pud_offset(pgdp, ea);
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+ region_start, region_end);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (map_page_size == PMD_SIZE) {
+ ptep = pmdp_ptep(pmdp);
+ goto set_the_pte;
+ }
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+ region_start, region_end);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+ smp_wmb();
+ return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size,
+ int nid,
+ unsigned long region_start, unsigned long region_end)
{
+ unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
pud_t *pudp;
pmd_t *pmdp;
@@ -70,61 +138,48 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
* Make sure task size is correct as per the max adddr
*/
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
- if (slab_is_available()) {
- pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
- if (!pudp)
- return -ENOMEM;
- if (map_page_size == PUD_SIZE) {
- ptep = (pte_t *)pudp;
- goto set_the_pte;
- }
- pmdp = pmd_alloc(&init_mm, pudp, ea);
- if (!pmdp)
- return -ENOMEM;
- if (map_page_size == PMD_SIZE) {
- ptep = pmdp_ptep(pmdp);
- goto set_the_pte;
- }
- ptep = pte_alloc_kernel(pmdp, ea);
- if (!ptep)
- return -ENOMEM;
- } else {
- pgdp = pgd_offset_k(ea);
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
- BUG_ON(pudp == NULL);
- pgd_populate(&init_mm, pgdp, pudp);
- }
- pudp = pud_offset(pgdp, ea);
- if (map_page_size == PUD_SIZE) {
- ptep = (pte_t *)pudp;
- goto set_the_pte;
- }
- if (pud_none(*pudp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
- BUG_ON(pmdp == NULL);
- pud_populate(&init_mm, pudp, pmdp);
- }
- pmdp = pmd_offset(pudp, ea);
- if (map_page_size == PMD_SIZE) {
- ptep = pmdp_ptep(pmdp);
- goto set_the_pte;
- }
- if (!pmd_present(*pmdp)) {
- ptep = early_alloc_pgtable(PAGE_SIZE);
- BUG_ON(ptep == NULL);
- pmd_populate_kernel(&init_mm, pmdp, ptep);
- }
- ptep = pte_offset_kernel(pmdp, ea);
+
+ if (unlikely(!slab_is_available()))
+ return early_map_kernel_page(ea, pa, flags, map_page_size,
+ nid, region_start, region_end);
+
+ /*
+ * Should make page table allocation functions be able to take a
+ * node, so we can place kernel page tables on the right nodes after
+ * boot.
+ */
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
}
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ if (map_page_size == PMD_SIZE) {
+ ptep = pmdp_ptep(pmdp);
+ goto set_the_pte;
+ }
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
set_the_pte:
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
smp_wmb();
return 0;
}
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
+}
+
#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
unsigned long clear)
@@ -171,16 +226,6 @@ void radix__mark_rodata_ro(void)
{
unsigned long start, end;
- /*
- * mark_rodata_ro() will mark itself as !writable at some point.
- * Due to DD1 workaround in radix__pte_update(), we'll end up with
- * an invalid pte and the system will crash quite severly.
- */
- if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
- pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
- return;
- }
-
start = (unsigned long)_stext;
end = (unsigned long)__init_begin;
@@ -196,9 +241,8 @@ void radix__mark_initmem_nx(void)
}
#endif /* CONFIG_STRICT_KERNEL_RWX */
-static inline void __meminit print_mapping(unsigned long start,
- unsigned long end,
- unsigned long size)
+static inline void __meminit
+print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
char buf[10];
@@ -207,76 +251,78 @@ static inline void __meminit print_mapping(unsigned long start,
string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
- pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
+ pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
+ exec ? " (exec)" : "");
+}
+
+static unsigned long next_boundary(unsigned long addr, unsigned long end)
+{
+#ifdef CONFIG_STRICT_KERNEL_RWX
+ if (addr < __pa_symbol(__init_begin))
+ return __pa_symbol(__init_begin);
+#endif
+ return end;
}
static int __meminit create_physical_mapping(unsigned long start,
- unsigned long end)
+ unsigned long end,
+ int nid)
{
unsigned long vaddr, addr, mapping_size = 0;
+ bool prev_exec, exec = false;
pgprot_t prot;
- unsigned long max_mapping_size;
-#ifdef CONFIG_STRICT_KERNEL_RWX
- int split_text_mapping = 1;
-#else
- int split_text_mapping = 0;
-#endif
+ int psize;
start = _ALIGN_UP(start, PAGE_SIZE);
for (addr = start; addr < end; addr += mapping_size) {
unsigned long gap, previous_size;
int rc;
- gap = end - addr;
+ gap = next_boundary(addr, end) - addr;
previous_size = mapping_size;
- max_mapping_size = PUD_SIZE;
+ prev_exec = exec;
-retry:
if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
- mmu_psize_defs[MMU_PAGE_1G].shift &&
- PUD_SIZE <= max_mapping_size)
+ mmu_psize_defs[MMU_PAGE_1G].shift) {
mapping_size = PUD_SIZE;
- else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
- mmu_psize_defs[MMU_PAGE_2M].shift)
+ psize = MMU_PAGE_1G;
+ } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_2M].shift) {
mapping_size = PMD_SIZE;
- else
+ psize = MMU_PAGE_2M;
+ } else {
mapping_size = PAGE_SIZE;
-
- if (split_text_mapping && (mapping_size == PUD_SIZE) &&
- (addr <= __pa_symbol(__init_begin)) &&
- (addr + mapping_size) >= __pa_symbol(_stext)) {
- max_mapping_size = PMD_SIZE;
- goto retry;
- }
-
- if (split_text_mapping && (mapping_size == PMD_SIZE) &&
- (addr <= __pa_symbol(__init_begin)) &&
- (addr + mapping_size) >= __pa_symbol(_stext))
- mapping_size = PAGE_SIZE;
-
- if (mapping_size != previous_size) {
- print_mapping(start, addr, previous_size);
- start = addr;
+ psize = mmu_virtual_psize;
}
vaddr = (unsigned long)__va(addr);
if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
- overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
+ overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
prot = PAGE_KERNEL_X;
- else
+ exec = true;
+ } else {
prot = PAGE_KERNEL;
+ exec = false;
+ }
+
+ if (mapping_size != previous_size || exec != prev_exec) {
+ print_mapping(start, addr, previous_size, prev_exec);
+ start = addr;
+ }
- rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size);
+ rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
if (rc)
return rc;
+
+ update_page_count(psize, 1);
}
- print_mapping(start, addr, mapping_size);
+ print_mapping(start, addr, mapping_size, exec);
return 0;
}
-static void __init radix_init_pgtable(void)
+void __init radix_init_pgtable(void)
{
unsigned long rts_field;
struct memblock_region *reg;
@@ -286,9 +332,16 @@ static void __init radix_init_pgtable(void)
/*
* Create the linear mapping, using standard page size for now
*/
- for_each_memblock(memory, reg)
+ for_each_memblock(memory, reg) {
+ /*
+ * The memblock allocator is up at this point, so the
+ * page tables will be allocated within the range. No
+ * need or a node (which we don't have yet).
+ */
WARN_ON(create_physical_mapping(reg->base,
- reg->base + reg->size));
+ reg->base + reg->size,
+ -1));
+ }
/* Find out how many PID bits are supported */
if (cpu_has_feature(CPU_FTR_HVMODE)) {
@@ -317,7 +370,7 @@ static void __init radix_init_pgtable(void)
* host.
*/
BUG_ON(PRTB_SIZE_SHIFT > 36);
- process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
+ process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
/*
* Fill in the process table.
*/
@@ -470,35 +523,6 @@ found:
return;
}
-static void update_hid_for_radix(void)
-{
- unsigned long hid0;
- unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */
-
- asm volatile("ptesync": : :"memory");
- /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
- /* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
- asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
- trace_tlbie(0, 0, rb, 0, 2, 0, 1);
- trace_tlbie(0, 0, rb, 0, 2, 1, 1);
-
- /*
- * now switch the HID
- */
- hid0 = mfspr(SPRN_HID0);
- hid0 |= HID0_POWER9_RADIX;
- mtspr(SPRN_HID0, hid0);
- asm volatile("isync": : :"memory");
-
- /* Wait for it to happen */
- while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
- cpu_relax();
-}
-
static void radix_init_amor(void)
{
/*
@@ -513,22 +537,12 @@ static void radix_init_amor(void)
static void radix_init_iamr(void)
{
- unsigned long iamr;
-
- /*
- * The IAMR should set to 0 on DD1.
- */
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- iamr = 0;
- else
- iamr = (1ul << 62);
-
/*
* Radix always uses key0 of the IAMR to determine if an access is
* allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
* fetch.
*/
- mtspr(SPRN_IAMR, iamr);
+ mtspr(SPRN_IAMR, (1ul << 62));
}
void __init radix__early_init_mmu(void)
@@ -554,7 +568,6 @@ void __init radix__early_init_mmu(void)
__pud_index_size = RADIX_PUD_INDEX_SIZE;
__pgd_index_size = RADIX_PGD_INDEX_SIZE;
__pud_cache_index = RADIX_PUD_INDEX_SIZE;
- __pmd_cache_index = RADIX_PMD_INDEX_SIZE;
__pte_table_size = RADIX_PTE_TABLE_SIZE;
__pmd_table_size = RADIX_PMD_TABLE_SIZE;
__pud_table_size = RADIX_PUD_TABLE_SIZE;
@@ -575,17 +588,13 @@ void __init radix__early_init_mmu(void)
#ifdef CONFIG_PCI
pci_io_base = ISA_IO_BASE;
#endif
-
- /*
- * For now radix also use the same frag size
- */
- __pte_frag_nr = H_PTE_FRAG_NR;
- __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+ __pte_frag_nr = RADIX_PTE_FRAG_NR;
+ __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = RADIX_PMD_FRAG_NR;
+ __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
radix_init_native();
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- update_hid_for_radix();
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
@@ -611,10 +620,6 @@ void radix__early_init_mmu_secondary(void)
* update partition table control register and UPRT
*/
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-
- if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- update_hid_for_radix();
-
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
@@ -695,7 +700,7 @@ struct change_mapping_params {
unsigned long aligned_end;
};
-static int stop_machine_change_mapping(void *data)
+static int __meminit stop_machine_change_mapping(void *data)
{
struct change_mapping_params *params =
(struct change_mapping_params *)data;
@@ -705,8 +710,8 @@ static int stop_machine_change_mapping(void *data)
spin_unlock(&init_mm.page_table_lock);
pte_clear(&init_mm, params->aligned_start, params->pte);
- create_physical_mapping(params->aligned_start, params->start);
- create_physical_mapping(params->end, params->aligned_end);
+ create_physical_mapping(params->aligned_start, params->start, -1);
+ create_physical_mapping(params->end, params->aligned_end, -1);
spin_lock(&init_mm.page_table_lock);
return 0;
}
@@ -742,7 +747,7 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
/*
* clear the pte and potentially split the mapping helper
*/
-static void split_kernel_mapping(unsigned long addr, unsigned long end,
+static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
unsigned long size, pte_t *pte)
{
unsigned long mask = ~(size - 1);
@@ -835,7 +840,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
}
}
-static void remove_pagetable(unsigned long start, unsigned long end)
+static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
unsigned long addr, next;
pud_t *pud_base;
@@ -863,12 +868,12 @@ static void remove_pagetable(unsigned long start, unsigned long end)
radix__flush_tlb_kernel_range(start, end);
}
-int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
+int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
- return create_physical_mapping(start, end);
+ return create_physical_mapping(start, end, nid);
}
-int radix__remove_section_mapping(unsigned long start, unsigned long end)
+int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
remove_pagetable(start, end);
return 0;
@@ -876,19 +881,30 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end)
#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+ pgprot_t flags, unsigned int map_page_size,
+ int nid)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys)
{
/* Create a PTE encoding */
unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+ int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+ int ret;
+
+ ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+ BUG_ON(ret);
- BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size));
return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
-void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
remove_pagetable(start, start + page_size);
}
@@ -905,7 +921,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
#ifdef CONFIG_DEBUG_VM
WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
@@ -1013,3 +1029,37 @@ int radix__has_transparent_hugepage(void)
return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+ pte_t entry, unsigned long address, int psize)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+ _PAGE_RW | _PAGE_EXEC);
+
+ unsigned long change = pte_val(entry) ^ pte_val(*ptep);
+ /*
+ * To avoid NMMU hang while relaxing access, we need mark
+ * the pte invalid in between.
+ */
+ if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+ unsigned long old_pte, new_pte;
+
+ old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
+ /*
+ * new value of pte
+ */
+ new_pte = old_pte | set;
+ radix__flush_tlb_page_psize(mm, address, psize);
+ __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
+ } else {
+ __radix_pte_update(ptep, 0, set);
+ /*
+ * Book3S does not require a TLB flush when relaxing access
+ * restrictions when the address space is not attached to a
+ * NMMU, because the core MMU will reload the pte after taking
+ * an access fault, which is defined by the architectue.
+ */
+ }
+ /* See ptesync comment in radix__set_pte_at */
+}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9f361ae571e9..010e1c616cb2 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -44,20 +44,13 @@ static inline int is_exec_fault(void)
static inline int pte_looks_normal(pte_t pte)
{
-#if defined(CONFIG_PPC_BOOK3S_64)
- if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) {
+ if (pte_present(pte) && !pte_special(pte)) {
if (pte_ci(pte))
return 0;
if (pte_user(pte))
return 1;
}
return 0;
-#else
- return (pte_val(pte) &
- (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER |
- _PAGE_PRIVILEGED)) ==
- (_PAGE_PRESENT | _PAGE_USER);
-#endif
}
static struct page *maybe_pte_to_page(pte_t pte)
@@ -73,7 +66,7 @@ static struct page *maybe_pte_to_page(pte_t pte)
return page;
}
-#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
+#ifdef CONFIG_PPC_BOOK3S
/* Server-style MMU handles coherency when hashing if HW exec permission
* is supposed per page (currently 64-bit only). If not, then, we always
@@ -106,7 +99,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
return pte;
}
-#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
+#else /* CONFIG_PPC_BOOK3S */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
@@ -117,7 +110,7 @@ static pte_t set_pte_filter(pte_t pte)
struct page *pg;
/* No exec permission in the first place, move on */
- if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
+ if (!pte_exec(pte) || !pte_looks_normal(pte))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
@@ -137,7 +130,7 @@ static pte_t set_pte_filter(pte_t pte)
}
/* Else, we filter out _PAGE_EXEC */
- return __pte(pte_val(pte) & ~_PAGE_EXEC);
+ return pte_exprotect(pte);
}
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
@@ -150,7 +143,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
* if necessary. Also if _PAGE_EXEC is already set, same deal,
* we just bail out
*/
- if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
+ if (dirty || pte_exec(pte) || !is_exec_fault())
return pte;
#ifdef CONFIG_DEBUG_VM
@@ -176,10 +169,10 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
set_bit(PG_arch_1, &pg->flags);
bail:
- return __pte(pte_val(pte) | _PAGE_EXEC);
+ return pte_mkexec(pte);
}
-#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
+#endif /* CONFIG_PPC_BOOK3S */
/*
* set_pte stores a linux PTE into the linux page table.
@@ -188,14 +181,13 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte)
{
/*
- * When handling numa faults, we already have the pte marked
- * _PAGE_PRESENT, but we can be sure that it is not in hpte.
- * Hence we can use set_pte_at for them.
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
*/
- VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep));
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
/* Add the pte bit when trying to set a pte */
- pte = __pte(pte_val(pte) | _PAGE_PTE);
+ pte = pte_mkpte(pte);
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
@@ -221,13 +213,54 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
entry = set_access_flags_filter(entry, vma, dirty);
changed = !pte_same(*(ptep), entry);
if (changed) {
- if (!is_vm_hugetlb_page(vma))
- assert_pte_locked(vma->vm_mm, address);
- __ptep_set_access_flags(vma->vm_mm, ptep, entry, address);
- flush_tlb_page(vma, address);
+ assert_pte_locked(vma->vm_mm, address);
+ __ptep_set_access_flags(vma, ptep, entry,
+ address, mmu_virtual_psize);
+ }
+ return changed;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+#ifdef HUGETLB_NEED_PRELOAD
+ /*
+ * The "return 1" forces a call of update_mmu_cache, which will write a
+ * TLB entry. Without this, platforms that don't do a write of the TLB
+ * entry in the TLB miss handler asm will fault ad infinitum.
+ */
+ ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+ return 1;
+#else
+ int changed, psize;
+
+ pte = set_access_flags_filter(pte, vma, dirty);
+ changed = !pte_same(*(ptep), pte);
+ if (changed) {
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ struct hstate *h = hstate_vma(vma);
+
+ psize = hstate_get_psize(h);
+#ifdef CONFIG_DEBUG_VM
+ assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
+#endif
+
+#else
+ /*
+ * Not used on non book3s64 platforms. But 8xx
+ * can possibly use tsize derived from hstate.
+ */
+ psize = 0;
+#endif
+ __ptep_set_access_flags(vma, ptep, pte, addr, psize);
}
return changed;
+#endif
}
+#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index d35d9ad3c1cd..bda3c6f1bd32 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -50,7 +50,7 @@ __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
if (slab_is_available()) {
pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
} else {
- pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
+ pte = __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE));
if (pte)
clear_page(pte);
}
@@ -76,56 +76,69 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
void __iomem *
ioremap(phys_addr_t addr, unsigned long size)
{
- return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
- __builtin_return_address(0));
+ pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
+
+ return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap);
void __iomem *
ioremap_wc(phys_addr_t addr, unsigned long size)
{
- return __ioremap_caller(addr, size, _PAGE_NO_CACHE,
- __builtin_return_address(0));
+ pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
+
+ return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_wc);
void __iomem *
+ioremap_wt(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
+
+ return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *
+ioremap_coherent(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached(PAGE_KERNEL);
+
+ return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_coherent);
+
+void __iomem *
ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
{
+ pte_t pte = __pte(flags);
+
/* writeable implies dirty for kernel addresses */
- if ((flags & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO)
- flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+ if (pte_write(pte))
+ pte = pte_mkdirty(pte);
/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
- flags &= ~(_PAGE_USER | _PAGE_EXEC);
- flags |= _PAGE_PRIVILEGED;
+ pte = pte_exprotect(pte);
+ pte = pte_mkprivileged(pte);
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+ return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_prot);
void __iomem *
__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
{
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+ return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0));
}
void __iomem *
-__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
- void *caller)
+__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
{
unsigned long v, i;
phys_addr_t p;
int err;
- /* Make sure we have the base flags */
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= pgprot_val(PAGE_KERNEL);
-
- /* Non-cacheable page cannot be coherent */
- if (flags & _PAGE_NO_CACHE)
- flags &= ~_PAGE_COHERENT;
-
/*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
@@ -148,7 +161,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
* mem_init() sets high_memory so only do the check after that.
*/
if (slab_is_available() && (p < virt_to_phys(high_memory)) &&
- !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) {
+ page_is_ram(__phys_to_pfn(p))) {
printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n",
(unsigned long long)p, __builtin_return_address(0));
return NULL;
@@ -183,7 +196,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
err = 0;
for (i = 0; i < size && err == 0; i += PAGE_SIZE)
- err = map_kernel_page(v+i, p+i, flags);
+ err = map_kernel_page(v + i, p + i, prot);
if (err) {
if (slab_is_available())
vunmap((void *)v);
@@ -209,7 +222,7 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-int map_kernel_page(unsigned long va, phys_addr_t pa, int flags)
+int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
{
pmd_t *pd;
pte_t *pg;
@@ -224,10 +237,8 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags)
/* The PTE should never be already set nor present in the
* hash table
*/
- BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) &&
- flags);
- set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
+ BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot));
+ set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot));
}
smp_wmb();
return err;
@@ -238,7 +249,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags)
*/
static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
- unsigned long v, s, f;
+ unsigned long v, s;
phys_addr_t p;
int ktext;
@@ -248,11 +259,10 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
for (; s < top; s += PAGE_SIZE) {
ktext = ((char *)v >= _stext && (char *)v < etext) ||
((char *)v >= _sinittext && (char *)v < _einittext);
- f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL);
- map_kernel_page(v, p, f);
+ map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
#ifdef CONFIG_PPC_STD_MMU_32
if (ktext)
- hash_preload(&init_mm, v, 0, 0x300);
+ hash_preload(&init_mm, v, false, 0x300);
#endif
v += PAGE_SIZE;
p += PAGE_SIZE;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index adf469f312f2..fb1375c07e8c 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,7 +33,6 @@
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
-#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
@@ -47,21 +46,14 @@
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
-#include <asm/trace.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/dma.h>
-#include <asm/powernv.h>
#include "mmu_decl.h"
-#ifdef CONFIG_PPC_BOOK3S_64
-#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
-#error TASK_SIZE_USER64 exceeds user VSID range
-#endif
-#endif
#ifdef CONFIG_PPC_BOOK3S_64
/*
@@ -80,8 +72,6 @@ unsigned long __pud_index_size;
EXPORT_SYMBOL(__pud_index_size);
unsigned long __pgd_index_size;
EXPORT_SYMBOL(__pgd_index_size);
-unsigned long __pmd_cache_index;
-EXPORT_SYMBOL(__pmd_cache_index);
unsigned long __pud_cache_index;
EXPORT_SYMBOL(__pud_cache_index);
unsigned long __pte_table_size;
@@ -123,17 +113,12 @@ unsigned long ioremap_bot = IOREMAP_BASE;
* __ioremap_at - Low level function to establish the page tables
* for an IO mapping
*/
-void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
- unsigned long flags)
+void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
{
unsigned long i;
- /* Make sure we have the base flags */
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= pgprot_val(PAGE_KERNEL);
-
/* We don't support the 4K PFN hack with ioremap */
- if (flags & H_PAGE_4K_PFN)
+ if (pgprot_val(prot) & H_PAGE_4K_PFN)
return NULL;
WARN_ON(pa & ~PAGE_MASK);
@@ -141,7 +126,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
WARN_ON(size & ~PAGE_MASK);
for (i = 0; i < size; i += PAGE_SIZE)
- if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
+ if (map_kernel_page((unsigned long)ea + i, pa + i, prot))
return NULL;
return (void __iomem *)ea;
@@ -162,7 +147,7 @@ void __iounmap_at(void *ea, unsigned long size)
}
void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
- unsigned long flags, void *caller)
+ pgprot_t prot, void *caller)
{
phys_addr_t paligned;
void __iomem *ret;
@@ -192,11 +177,11 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
return NULL;
area->phys_addr = paligned;
- ret = __ioremap_at(paligned, area->addr, size, flags);
+ ret = __ioremap_at(paligned, area->addr, size, prot);
if (!ret)
vunmap(area->addr);
} else {
- ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
+ ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
if (ret)
ioremap_bot += size;
}
@@ -209,49 +194,59 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
+ return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0));
}
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
- unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
+ pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
+ return ppc_md.ioremap(addr, size, prot, caller);
+ return __ioremap_caller(addr, size, prot, caller);
}
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
- unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
+ pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
void *caller = __builtin_return_address(0);
if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
+ return ppc_md.ioremap(addr, size, prot, caller);
+ return __ioremap_caller(addr, size, prot, caller);
+}
+
+void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ if (ppc_md.ioremap)
+ return ppc_md.ioremap(addr, size, prot, caller);
+ return __ioremap_caller(addr, size, prot, caller);
}
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
+ pte_t pte = __pte(flags);
void *caller = __builtin_return_address(0);
/* writeable implies dirty for kernel addresses */
- if (flags & _PAGE_WRITE)
- flags |= _PAGE_DIRTY;
+ if (pte_write(pte))
+ pte = pte_mkdirty(pte);
/* we don't want to let _PAGE_EXEC leak out */
- flags &= ~_PAGE_EXEC;
+ pte = pte_exprotect(pte);
/*
* Force kernel mapping.
*/
- flags &= ~_PAGE_USER;
- flags |= _PAGE_PRIVILEGED;
+ pte = pte_mkprivileged(pte);
if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
+ return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller);
+ return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
}
@@ -316,177 +311,11 @@ struct page *pud_page(pud_t pud)
*/
struct page *pmd_page(pmd_t pmd)
{
- if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))
+ if (pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))
return pte_page(pmd_pte(pmd));
return virt_to_page(pmd_page_vaddr(pmd));
}
-#ifdef CONFIG_PPC_64K_PAGES
-static pte_t *get_from_cache(struct mm_struct *mm)
-{
- void *pte_frag, *ret;
-
- spin_lock(&mm->page_table_lock);
- ret = mm->context.pte_frag;
- if (ret) {
- pte_frag = ret + PTE_FRAG_SIZE;
- /*
- * If we have taken up all the fragments mark PTE page NULL
- */
- if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
- pte_frag = NULL;
- mm->context.pte_frag = pte_frag;
- }
- spin_unlock(&mm->page_table_lock);
- return (pte_t *)ret;
-}
-
-static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
-{
- void *ret = NULL;
- struct page *page;
-
- if (!kernel) {
- page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
- if (!page)
- return NULL;
- if (!pgtable_page_ctor(page)) {
- __free_page(page);
- return NULL;
- }
- } else {
- page = alloc_page(PGALLOC_GFP);
- if (!page)
- return NULL;
- }
-
- ret = page_address(page);
- spin_lock(&mm->page_table_lock);
- /*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
- * count.
- */
- if (likely(!mm->context.pte_frag)) {
- set_page_count(page, PTE_FRAG_NR);
- mm->context.pte_frag = ret + PTE_FRAG_SIZE;
- }
- spin_unlock(&mm->page_table_lock);
-
- return (pte_t *)ret;
-}
-
-pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
-{
- pte_t *pte;
-
- pte = get_from_cache(mm);
- if (pte)
- return pte;
-
- return __alloc_for_cache(mm, kernel);
-}
-#endif /* CONFIG_PPC_64K_PAGES */
-
-void pte_fragment_free(unsigned long *table, int kernel)
-{
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- if (!kernel)
- pgtable_page_dtor(page);
- free_unref_page(page);
- }
-}
-
-#ifdef CONFIG_SMP
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
-{
- unsigned long pgf = (unsigned long)table;
-
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- pgf |= shift;
- tlb_remove_table(tlb, (void *)pgf);
-}
-
-void __tlb_remove_table(void *_table)
-{
- void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
- unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
- if (!shift)
- /* PTE page needs special handling */
- pte_fragment_free(table, 0);
- else {
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(shift), table);
- }
-}
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
-{
- if (!shift) {
- /* PTE page needs special handling */
- pte_fragment_free(table, 0);
- } else {
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(shift), table);
- }
-}
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_64
-void __init mmu_partition_table_init(void)
-{
- unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
- unsigned long ptcr;
-
- BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
- partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
- MEMBLOCK_ALLOC_ANYWHERE));
-
- /* Initialize the Partition Table with no entries */
- memset((void *)partition_tb, 0, patb_size);
-
- /*
- * update partition table control register,
- * 64 K size.
- */
- ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
- mtspr(SPRN_PTCR, ptcr);
- powernv_set_nmmu_ptcr(ptcr);
-}
-
-void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
- unsigned long dw1)
-{
- unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
-
- partition_tb[lpid].patb0 = cpu_to_be64(dw0);
- partition_tb[lpid].patb1 = cpu_to_be64(dw1);
-
- /*
- * Global flush of TLBs and partition table caches for this lpid.
- * The type of flush (hash or radix) depends on what the previous
- * use of this partition ID was, not the new use.
- */
- asm volatile("ptesync" : : : "memory");
- if (old & PATB_HR) {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
- } else {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
- }
- /* do we need fixup here ?*/
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
#ifdef CONFIG_STRICT_KERNEL_RWX
void mark_rodata_ro(void)
{
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index ba71c5481f42..b271b283c785 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -14,9 +14,12 @@ DEFINE_STATIC_KEY_TRUE(pkey_disabled);
bool pkey_execute_disable_supported;
int pkeys_total; /* Total pkeys as per device tree */
bool pkeys_devtree_defined; /* pkey property exported by device tree */
-u32 initial_allocation_mask; /* Bits set for reserved keys */
-u64 pkey_amr_uamor_mask; /* Bits in AMR/UMOR not to be touched */
+u32 initial_allocation_mask; /* Bits set for the initially allocated keys */
+u32 reserved_allocation_mask; /* Bits set for reserved keys */
+u64 pkey_amr_mask; /* Bits in AMR not to be touched */
u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
+u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
+int execute_only_key = 2;
#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
@@ -42,7 +45,7 @@ static void scan_pkey_feature(void)
* Since any pkey can be used for data or execute, we will just treat
* all keys as equal and track them as one entity.
*/
- pkeys_total = be32_to_cpu(vals[0]);
+ pkeys_total = vals[0];
pkeys_devtree_defined = true;
}
@@ -91,7 +94,7 @@ int pkey_initialize(void)
* arch-neutral code.
*/
pkeys_total = min_t(int, pkeys_total,
- (ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT));
+ ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
static_branch_enable(&pkey_disabled);
@@ -119,23 +122,39 @@ int pkey_initialize(void)
#else
os_reserved = 0;
#endif
- /*
- * Bits are in LE format. NOTE: 1, 0 are reserved.
- * key 0 is the default key, which allows read/write/execute.
- * key 1 is recommended not to be used. PowerISA(3.0) page 1015,
- * programming note.
- */
- initial_allocation_mask = ~0x0;
+ /* Bits are in LE format. */
+ reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
/* register mask is in BE format */
- pkey_amr_uamor_mask = ~0x0ul;
+ pkey_amr_mask = ~0x0ul;
+ pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
+
pkey_iamr_mask = ~0x0ul;
+ pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
+ pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
- for (i = 2; i < (pkeys_total - os_reserved); i++) {
- initial_allocation_mask &= ~(0x1 << i);
- pkey_amr_uamor_mask &= ~(0x3ul << pkeyshift(i));
- pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
+ pkey_uamor_mask = ~0x0ul;
+ pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
+ pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+
+ /* mark the rest of the keys as reserved and hence unavailable */
+ for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
+ reserved_allocation_mask |= (0x1 << i);
+ pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
+ }
+ initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
+
+ if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
+ /*
+ * Insufficient number of keys to support
+ * execute only key. Mark it unavailable.
+ * Any AMR, UAMOR, IAMR bit set for
+ * this key is irrelevant since this key
+ * can never be allocated.
+ */
+ execute_only_key = -1;
}
+
return 0;
}
@@ -146,8 +165,7 @@ void pkey_mm_init(struct mm_struct *mm)
if (static_branch_likely(&pkey_disabled))
return;
mm_pkey_allocation_map(mm) = initial_allocation_mask;
- /* -1 means unallocated or invalid */
- mm->context.execute_only_pkey = -1;
+ mm->context.execute_only_pkey = execute_only_key;
}
static inline u64 read_amr(void)
@@ -216,33 +234,6 @@ static inline void init_iamr(int pkey, u8 init_bits)
write_iamr(old_iamr | new_iamr_bits);
}
-static void pkey_status_change(int pkey, bool enable)
-{
- u64 old_uamor;
-
- /* Reset the AMR and IAMR bits for this key */
- init_amr(pkey, 0x0);
- init_iamr(pkey, 0x0);
-
- /* Enable/disable key */
- old_uamor = read_uamor();
- if (enable)
- old_uamor |= (0x3ul << pkeyshift(pkey));
- else
- old_uamor &= ~(0x3ul << pkeyshift(pkey));
- write_uamor(old_uamor);
-}
-
-void __arch_activate_pkey(int pkey)
-{
- pkey_status_change(pkey, true);
-}
-
-void __arch_deactivate_pkey(int pkey)
-{
- pkey_status_change(pkey, false);
-}
-
/*
* Set the access rights in AMR IAMR and UAMOR registers for @pkey to that
* specified in @init_val.
@@ -292,9 +283,6 @@ void thread_pkey_regs_restore(struct thread_struct *new_thread,
if (static_branch_likely(&pkey_disabled))
return;
- /*
- * TODO: Just set UAMOR to zero if @new_thread hasn't used any keys yet.
- */
if (old_thread->amr != new_thread->amr)
write_amr(new_thread->amr);
if (old_thread->iamr != new_thread->iamr)
@@ -308,9 +296,13 @@ void thread_pkey_regs_init(struct thread_struct *thread)
if (static_branch_likely(&pkey_disabled))
return;
- write_amr(read_amr() & pkey_amr_uamor_mask);
- write_iamr(read_iamr() & pkey_iamr_mask);
- write_uamor(read_uamor() & pkey_amr_uamor_mask);
+ thread->amr = pkey_amr_mask;
+ thread->iamr = pkey_iamr_mask;
+ thread->uamor = pkey_uamor_mask;
+
+ write_uamor(pkey_uamor_mask);
+ write_amr(pkey_amr_mask);
+ write_iamr(pkey_iamr_mask);
}
static inline bool pkey_allows_readwrite(int pkey)
@@ -325,48 +317,7 @@ static inline bool pkey_allows_readwrite(int pkey)
int __execute_only_pkey(struct mm_struct *mm)
{
- bool need_to_set_mm_pkey = false;
- int execute_only_pkey = mm->context.execute_only_pkey;
- int ret;
-
- /* Do we need to assign a pkey for mm's execute-only maps? */
- if (execute_only_pkey == -1) {
- /* Go allocate one to use, which might fail */
- execute_only_pkey = mm_pkey_alloc(mm);
- if (execute_only_pkey < 0)
- return -1;
- need_to_set_mm_pkey = true;
- }
-
- /*
- * We do not want to go through the relatively costly dance to set AMR
- * if we do not need to. Check it first and assume that if the
- * execute-only pkey is readwrite-disabled than we do not have to set it
- * ourselves.
- */
- if (!need_to_set_mm_pkey && !pkey_allows_readwrite(execute_only_pkey))
- return execute_only_pkey;
-
- /*
- * Set up AMR so that it denies access for everything other than
- * execution.
- */
- ret = __arch_set_user_pkey_access(current, execute_only_pkey,
- PKEY_DISABLE_ACCESS |
- PKEY_DISABLE_WRITE);
- /*
- * If the AMR-set operation failed somehow, just return 0 and
- * effectively disable execute-only support.
- */
- if (ret) {
- mm_pkey_free(mm, execute_only_pkey);
- return -1;
- }
-
- /* We got one, store it and use it from here on out */
- if (need_to_set_mm_pkey)
- mm->context.execute_only_pkey = execute_only_pkey;
- return execute_only_pkey;
+ return mm->context.execute_only_pkey;
}
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
@@ -386,9 +337,9 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
{
/*
* If the currently associated pkey is execute-only, but the requested
- * protection requires read or write, move it back to the default pkey.
+ * protection is not execute-only, move it back to the default pkey.
*/
- if (vma_is_pkey_exec_only(vma) && (prot & (PROT_READ | PROT_WRITE)))
+ if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
return 0;
/*
@@ -410,9 +361,6 @@ static bool pkey_access_permitted(int pkey, bool write, bool execute)
int pkey_shift;
u64 amr;
- if (!pkey)
- return true;
-
if (!is_pkey_enabled(pkey))
return true;
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 2a049fb8523d..f6f575bae3bc 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -163,11 +163,11 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
* Preload a translation in the hash table
*/
void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap)
+ bool is_exec, unsigned long trap)
{
pmd_t *pmd;
- if (Hash == 0)
+ if (!Hash)
return;
pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
if (!pmd_none(*pmd))
@@ -224,7 +224,7 @@ void __init MMU_init_hw(void)
* Find some memory for the hash table.
*/
if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = __va(memblock_alloc(Hash_size, Hash_size));
+ Hash = __va(memblock_phys_alloc(Hash_size, Hash_size));
memset(Hash, 0, Hash_size);
_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 13cfe413b40d..bc3914d54e26 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -14,14 +14,17 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <asm/asm-prototypes.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
+#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/smp.h>
#include <linux/compiler.h>
+#include <linux/context_tracking.h>
#include <linux/mm_types.h>
#include <asm/udbg.h>
@@ -29,11 +32,10 @@
enum slb_index {
LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
- VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000000000000000) */
- KSTACK_INDEX = 2, /* Kernel stack map */
+ KSTACK_INDEX = 1, /* Kernel stack map */
};
-extern void slb_allocate(unsigned long ea);
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
#define slb_esid_mask(ssize) \
(((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
@@ -44,13 +46,35 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
}
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
unsigned long flags)
{
- return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
+ return (vsid << slb_vsid_shift(ssize)) | flags |
((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
}
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+ unsigned long flags)
+{
+ return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
+}
+
+static void assert_slb_presence(bool present, unsigned long ea)
+{
+#ifdef CONFIG_DEBUG_VM
+ unsigned long tmp;
+
+ WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
+
+ asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
+
+ WARN_ON(present == (tmp == 0));
+#endif
+}
+
static inline void slb_shadow_update(unsigned long ea, int ssize,
unsigned long flags,
enum slb_index index)
@@ -62,14 +86,14 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
* updating it. No write barriers are needed here, provided
* we only update the current CPU's SLB shadow buffer.
*/
- p->save_area[index].esid = 0;
- p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags));
- p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index));
+ WRITE_ONCE(p->save_area[index].esid, 0);
+ WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+ WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
}
static inline void slb_shadow_clear(enum slb_index index)
{
- get_slb_shadow()->save_area[index].esid = 0;
+ WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
}
static inline void create_shadowed_slbe(unsigned long ea, int ssize,
@@ -83,54 +107,61 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
*/
slb_shadow_update(ea, ssize, flags, index);
+ assert_slb_presence(false, ea);
asm volatile("slbmte %0,%1" :
: "r" (mk_vsid_data(ea, ssize, flags)),
"r" (mk_esid_data(ea, ssize, index))
: "memory" );
}
-static void __slb_flush_and_rebolt(void)
+/*
+ * Insert bolted entries into SLB (which may not be empty, so don't clear
+ * slb_cache_ptr).
+ */
+void __slb_restore_bolted_realmode(void)
{
- /* If you change this make sure you change SLB_NUM_BOLTED
- * and PR KVM appropriately too. */
- unsigned long linear_llp, vmalloc_llp, lflags, vflags;
- unsigned long ksp_esid_data, ksp_vsid_data;
-
- linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
- vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
- lflags = SLB_VSID_KERNEL | linear_llp;
- vflags = SLB_VSID_KERNEL | vmalloc_llp;
+ struct slb_shadow *p = get_slb_shadow();
+ enum slb_index index;
- ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX);
- if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) {
- ksp_esid_data &= ~SLB_ESID_V;
- ksp_vsid_data = 0;
- slb_shadow_clear(KSTACK_INDEX);
- } else {
- /* Update stack entry; others don't change */
- slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX);
- ksp_vsid_data =
- be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid);
+ /* No isync needed because realmode. */
+ for (index = 0; index < SLB_NUM_BOLTED; index++) {
+ asm volatile("slbmte %0,%1" :
+ : "r" (be64_to_cpu(p->save_area[index].vsid)),
+ "r" (be64_to_cpu(p->save_area[index].esid)));
}
- /* We need to do this all in asm, so we're sure we don't touch
- * the stack between the slbia and rebolting it. */
- asm volatile("isync\n"
- "slbia\n"
- /* Slot 1 - first VMALLOC segment */
- "slbmte %0,%1\n"
- /* Slot 2 - kernel stack */
- "slbmte %2,%3\n"
- "isync"
- :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
- "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)),
- "r"(ksp_vsid_data),
- "r"(ksp_esid_data)
- : "memory");
+ assert_slb_presence(true, local_paca->kstack);
+}
+
+/*
+ * Insert the bolted entries into an empty SLB.
+ */
+void slb_restore_bolted_realmode(void)
+{
+ __slb_restore_bolted_realmode();
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
}
-void slb_flush_and_rebolt(void)
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
{
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+/*
+ * This flushes non-bolted entries, it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
WARN_ON(!irqs_disabled());
@@ -140,56 +171,241 @@ void slb_flush_and_rebolt(void)
*/
hard_irq_disable();
- __slb_flush_and_rebolt();
+ asm volatile("isync\n"
+ "slbia\n"
+ "slbmte %0, %1\n"
+ "isync\n"
+ :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
+ "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
+ : "memory");
+ assert_slb_presence(true, get_paca()->kstack);
+
get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+ int i;
+ unsigned long e, v;
+
+ /* Save slb_cache_ptr value. */
+ get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+ if (!slb_ptr)
+ return;
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+ asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+ slb_ptr->esid = e;
+ slb_ptr->vsid = v;
+ slb_ptr++;
+ }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+ int i, n;
+ unsigned long e, v;
+ unsigned long llp;
+
+ if (!slb_ptr)
+ return;
+
+ pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+ pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ e = slb_ptr->esid;
+ v = slb_ptr->vsid;
+ slb_ptr++;
+
+ if (!e && !v)
+ continue;
+
+ pr_err("%02d %016lx %016lx\n", i, e, v);
+
+ if (!(e & SLB_ESID_V)) {
+ pr_err("\n");
+ continue;
+ }
+ llp = v & SLB_VSID_LLP;
+ if (v & SLB_VSID_B_1T) {
+ pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID_1T(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+ } else {
+ pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+ }
+ }
+ pr_err("----------------------------------\n");
+
+ /* Dump slb cache entires as well. */
+ pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+ pr_err("Valid SLB cache entries:\n");
+ n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+ for (i = 0; i < n; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ pr_err("Rest of SLB cache entries:\n");
+ for (i = n; i < SLB_CACHE_ENTRIES; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
}
void slb_vmalloc_update(void)
{
- unsigned long vflags;
+ /*
+ * vmalloc is not bolted, so just have to flush non-bolted.
+ */
+ slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+ unsigned char i;
- vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX);
- slb_flush_and_rebolt();
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ if (esid == ti->slb_preload_esid[idx])
+ return true;
+ }
+ return false;
}
-/* Helper function to compare esids. There are four cases to handle.
- * 1. The system is not 1T segment size capable. Use the GET_ESID compare.
- * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare.
- * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match.
- * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare.
- */
-static inline int esids_match(unsigned long addr1, unsigned long addr2)
+static bool preload_add(struct thread_info *ti, unsigned long ea)
{
- int esid_1t_count;
+ unsigned char idx;
+ unsigned long esid;
- /* System is not 1T segment size capable. */
- if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
- return (GET_ESID(addr1) == GET_ESID(addr2));
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+ /* EAs are stored >> 28 so 256MB segments don't need clearing */
+ if (ea & ESID_MASK_1T)
+ ea &= ESID_MASK_1T;
+ }
+
+ esid = ea >> SID_SHIFT;
- esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
- ((addr2 >> SID_SHIFT_1T) != 0));
+ if (preload_hit(ti, esid))
+ return false;
- /* both addresses are < 1T */
- if (esid_1t_count == 0)
- return (GET_ESID(addr1) == GET_ESID(addr2));
+ idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+ ti->slb_preload_esid[idx] = esid;
+ if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+ else
+ ti->slb_preload_nr++;
- /* One address < 1T, the other > 1T. Not a match */
- if (esid_1t_count == 1)
- return 0;
+ return true;
+}
- /* Both addresses are > 1T. */
- return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2));
+static void preload_age(struct thread_info *ti)
+{
+ if (!ti->slb_preload_nr)
+ return;
+ ti->slb_preload_nr--;
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
}
+void slb_setup_new_exec(void)
+{
+ struct thread_info *ti = current_thread_info();
+ struct mm_struct *mm = current->mm;
+ unsigned long exec = 0x10000000;
+
+ WARN_ON(irqs_disabled());
+
+ /*
+ * preload cache can only be used to determine whether a SLB
+ * entry exists if it does not start to overflow.
+ */
+ if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
+ return;
+
+ hard_irq_disable();
+
+ /*
+ * We have no good place to clear the slb preload cache on exec,
+ * flush_thread is about the earliest arch hook but that happens
+ * after we switch to the mm and have aleady preloaded the SLBEs.
+ *
+ * For the most part that's probably okay to use entries from the
+ * previous exec, they will age out if unused. It may turn out to
+ * be an advantage to clear the cache before switching to it,
+ * however.
+ */
+
+ /*
+ * preload some userspace segments into the SLB.
+ * Almost all 32 and 64bit PowerPC executables are linked at
+ * 0x10000000 so it makes sense to preload this segment.
+ */
+ if (!is_kernel_addr(exec)) {
+ if (preload_add(ti, exec))
+ slb_allocate_user(mm, exec);
+ }
+
+ /* Libraries and mmaps. */
+ if (!is_kernel_addr(mm->mmap_base)) {
+ if (preload_add(ti, mm->mmap_base))
+ slb_allocate_user(mm, mm->mmap_base);
+ }
+
+ /* see switch_slb */
+ asm volatile("isync" : : : "memory");
+
+ local_irq_enable();
+}
+
+void preload_new_slb_context(unsigned long start, unsigned long sp)
+{
+ struct thread_info *ti = current_thread_info();
+ struct mm_struct *mm = current->mm;
+ unsigned long heap = mm->start_brk;
+
+ WARN_ON(irqs_disabled());
+
+ /* see above */
+ if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
+ return;
+
+ hard_irq_disable();
+
+ /* Userspace entry address. */
+ if (!is_kernel_addr(start)) {
+ if (preload_add(ti, start))
+ slb_allocate_user(mm, start);
+ }
+
+ /* Top of stack, grows down. */
+ if (!is_kernel_addr(sp)) {
+ if (preload_add(ti, sp))
+ slb_allocate_user(mm, sp);
+ }
+
+ /* Bottom of heap, grows up. */
+ if (heap && !is_kernel_addr(heap)) {
+ if (preload_add(ti, heap))
+ slb_allocate_user(mm, heap);
+ }
+
+ /* see switch_slb */
+ asm volatile("isync" : : : "memory");
+
+ local_irq_enable();
+}
+
+
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
{
- unsigned long offset;
- unsigned long slbie_data = 0;
- unsigned long pc = KSTK_EIP(tsk);
- unsigned long stack = KSTK_ESP(tsk);
- unsigned long exec_base;
+ struct thread_info *ti = task_thread_info(tsk);
+ unsigned char i;
/*
* We need interrupts hard-disabled here, not just soft-disabled,
@@ -198,91 +414,107 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
* which would update the slb_cache/slb_cache_ptr fields in the PACA.
*/
hard_irq_disable();
- offset = get_paca()->slb_cache_ptr;
- if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
- offset <= SLB_CACHE_ENTRIES) {
- int i;
- asm volatile("isync" : : : "memory");
- for (i = 0; i < offset; i++) {
- slbie_data = (unsigned long)get_paca()->slb_cache[i]
- << SID_SHIFT; /* EA */
- slbie_data |= user_segment_size(slbie_data)
- << SLBIE_SSIZE_SHIFT;
- slbie_data |= SLBIE_C; /* C set for user addresses */
- asm volatile("slbie %0" : : "r" (slbie_data));
- }
- asm volatile("isync" : : : "memory");
+ asm volatile("isync" : : : "memory");
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+ * associated lookaside structures, which matches what
+ * switch_slb wants. So ARCH_300 does not use the slb
+ * cache.
+ */
+ asm volatile(PPC_SLBIA(3));
} else {
- __slb_flush_and_rebolt();
- }
+ unsigned long offset = get_paca()->slb_cache_ptr;
+
+ if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+ offset <= SLB_CACHE_ENTRIES) {
+ unsigned long slbie_data = 0;
+
+ for (i = 0; i < offset; i++) {
+ unsigned long ea;
+
+ ea = (unsigned long)
+ get_paca()->slb_cache[i] << SID_SHIFT;
+ /*
+ * Could assert_slb_presence(true) here, but
+ * hypervisor or machine check could have come
+ * in and removed the entry at this point.
+ */
+
+ slbie_data = ea;
+ slbie_data |= user_segment_size(slbie_data)
+ << SLBIE_SSIZE_SHIFT;
+ slbie_data |= SLBIE_C; /* user slbs have C=1 */
+ asm volatile("slbie %0" : : "r" (slbie_data));
+ }
+
+ /* Workaround POWER5 < DD2.1 issue */
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+ asm volatile("slbie %0" : : "r" (slbie_data));
+
+ } else {
+ struct slb_shadow *p = get_slb_shadow();
+ unsigned long ksp_esid_data =
+ be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+ unsigned long ksp_vsid_data =
+ be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+ asm volatile(PPC_SLBIA(1) "\n"
+ "slbmte %0,%1\n"
+ "isync"
+ :: "r"(ksp_vsid_data),
+ "r"(ksp_esid_data));
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ }
- /* Workaround POWER5 < DD2.1 issue */
- if (offset == 1 || offset > SLB_CACHE_ENTRIES)
- asm volatile("slbie %0" : : "r" (slbie_data));
+ get_paca()->slb_cache_ptr = 0;
+ }
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
- get_paca()->slb_cache_ptr = 0;
copy_mm_to_paca(mm);
/*
- * preload some userspace segments into the SLB.
- * Almost all 32 and 64bit PowerPC executables are linked at
- * 0x10000000 so it makes sense to preload this segment.
+ * We gradually age out SLBs after a number of context switches to
+ * reduce reload overhead of unused entries (like we do with FP/VEC
+ * reload). Each time we wrap 256 switches, take an entry out of the
+ * SLB preload cache.
*/
- exec_base = 0x10000000;
+ tsk->thread.load_slb++;
+ if (!tsk->thread.load_slb) {
+ unsigned long pc = KSTK_EIP(tsk);
- if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
- is_kernel_addr(exec_base))
- return;
+ preload_age(ti);
+ preload_add(ti, pc);
+ }
- slb_allocate(pc);
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+ unsigned long ea;
- if (!esids_match(pc, stack))
- slb_allocate(stack);
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
- if (!esids_match(pc, exec_base) &&
- !esids_match(stack, exec_base))
- slb_allocate(exec_base);
-}
-
-static inline void patch_slb_encoding(unsigned int *insn_addr,
- unsigned int immed)
-{
+ slb_allocate_user(mm, ea);
+ }
/*
- * This function patches either an li or a cmpldi instruction with
- * a new immediate value. This relies on the fact that both li
- * (which is actually addi) and cmpldi both take a 16-bit immediate
- * value, and it is situated in the same location in the instruction,
- * ie. bits 16-31 (Big endian bit order) or the lower 16 bits.
- * The signedness of the immediate operand differs between the two
- * instructions however this code is only ever patching a small value,
- * much less than 1 << 15, so we can get away with it.
- * To patch the value we read the existing instruction, clear the
- * immediate value, and or in our new value, then write the instruction
- * back.
+ * Synchronize slbmte preloads with possible subsequent user memory
+ * address accesses by the kernel (user mode won't happen until
+ * rfid, which is safe).
*/
- unsigned int insn = (*insn_addr & 0xffff0000) | immed;
- patch_instruction(insn_addr, insn);
+ asm volatile("isync" : : : "memory");
}
-extern u32 slb_miss_kernel_load_linear[];
-extern u32 slb_miss_kernel_load_io[];
-extern u32 slb_compare_rr_to_size[];
-extern u32 slb_miss_kernel_load_vmemmap[];
-
void slb_set_size(u16 size)
{
- if (mmu_slb_size == size)
- return;
-
mmu_slb_size = size;
- patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
}
void slb_initialize(void)
{
unsigned long linear_llp, vmalloc_llp, io_llp;
- unsigned long lflags, vflags;
+ unsigned long lflags;
static int slb_encoding_inited;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
unsigned long vmemmap_llp;
@@ -298,34 +530,24 @@ void slb_initialize(void)
#endif
if (!slb_encoding_inited) {
slb_encoding_inited = 1;
- patch_slb_encoding(slb_miss_kernel_load_linear,
- SLB_VSID_KERNEL | linear_llp);
- patch_slb_encoding(slb_miss_kernel_load_io,
- SLB_VSID_KERNEL | io_llp);
- patch_slb_encoding(slb_compare_rr_to_size,
- mmu_slb_size);
-
pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
pr_devel("SLB: io LLP = %04lx\n", io_llp);
-
#ifdef CONFIG_SPARSEMEM_VMEMMAP
- patch_slb_encoding(slb_miss_kernel_load_vmemmap,
- SLB_VSID_KERNEL | vmemmap_llp);
pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
#endif
}
- get_paca()->stab_rr = SLB_NUM_BOLTED;
+ get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
lflags = SLB_VSID_KERNEL | linear_llp;
- vflags = SLB_VSID_KERNEL | vmalloc_llp;
/* Invalidate the entire SLB (even entry 0) & all the ERATS */
asm volatile("isync":::"memory");
asm volatile("slbmte %0,%0"::"r" (0) : "memory");
asm volatile("isync; slbia; isync":::"memory");
create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
- create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX);
/* For the boot cpu, we're running on the stack in init_thread_union,
* which is in the first segment of the linear mapping, and also
@@ -340,3 +562,260 @@ void slb_initialize(void)
asm volatile("isync":::"memory");
}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+ int slb_cache_index;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return; /* ISAv3.0B and later does not use slb_cache */
+
+ /*
+ * Now update slb cache entries
+ */
+ slb_cache_index = local_paca->slb_cache_ptr;
+ if (slb_cache_index < SLB_CACHE_ENTRIES) {
+ /*
+ * We have space in slb cache for optimized switch_slb().
+ * Top 36 bits from esid_data as per ISA
+ */
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+ local_paca->slb_cache_ptr++;
+ } else {
+ /*
+ * Our cache is full and the current cache content strictly
+ * doesn't indicate the active SLB conents. Bump the ptr
+ * so that switch_slb() will ignore the cache.
+ */
+ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+ }
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+ enum slb_index index;
+
+ /*
+ * The allocation bitmaps can become out of synch with the SLB
+ * when the _switch code does slbie when bolting a new stack
+ * segment and it must not be anywhere else in the SLB. This leaves
+ * a kernel allocated entry that is unused in the SLB. With very
+ * large systems or small segment sizes, the bitmaps could slowly
+ * fill with these entries. They will eventually be cleared out
+ * by the round robin allocator in that case, so it's probably not
+ * worth accounting for.
+ */
+
+ /*
+ * SLBs beyond 32 entries are allocated with stab_rr only
+ * POWER7/8/9 have 32 SLB entries, this could be expanded if a
+ * future CPU has more.
+ */
+ if (local_paca->slb_used_bitmap != U32_MAX) {
+ index = ffz(local_paca->slb_used_bitmap);
+ local_paca->slb_used_bitmap |= 1U << index;
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ } else {
+ /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+ index = local_paca->stab_rr;
+ if (index < (mmu_slb_size - 1))
+ index++;
+ else
+ index = SLB_NUM_BOLTED;
+ local_paca->stab_rr = index;
+ if (index < 32) {
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ else
+ local_paca->slb_kern_bitmap &= ~(1U << index);
+ }
+ }
+ BUG_ON(index < SLB_NUM_BOLTED);
+
+ return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+ unsigned long flags, int ssize, bool kernel)
+{
+ unsigned long vsid;
+ unsigned long vsid_data, esid_data;
+ enum slb_index index;
+
+ vsid = get_vsid(context, ea, ssize);
+ if (!vsid)
+ return -EFAULT;
+
+ /*
+ * There must not be a kernel SLB fault in alloc_slb_index or before
+ * slbmte here or the allocation bitmaps could get out of whack with
+ * the SLB.
+ *
+ * User SLB faults or preloads take this path which might get inlined
+ * into the caller, so add compiler barriers here to ensure unsafe
+ * memory accesses do not come between.
+ */
+ barrier();
+
+ index = alloc_slb_index(kernel);
+
+ vsid_data = __mk_vsid_data(vsid, ssize, flags);
+ esid_data = mk_esid_data(ea, ssize, index);
+
+ /*
+ * No need for an isync before or after this slbmte. The exception
+ * we enter with and the rfid we exit with are context synchronizing.
+ * User preloads should add isync afterwards in case the kernel
+ * accesses user memory before it returns to userspace with rfid.
+ */
+ assert_slb_presence(false, ea);
+ asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+ barrier();
+
+ if (!kernel)
+ slb_cache_update(esid_data);
+
+ return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+ unsigned long context;
+ unsigned long flags;
+ int ssize;
+
+ if (id == KERNEL_REGION_ID) {
+
+ /* We only support upto MAX_PHYSMEM_BITS */
+ if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS))
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ } else if (id == VMEMMAP_REGION_ID) {
+
+ if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ } else if (id == VMALLOC_REGION_ID) {
+
+ if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
+ return -EFAULT;
+
+ if (ea < H_VMALLOC_END)
+ flags = local_paca->vmalloc_sllp;
+ else
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+ } else {
+ return -EFAULT;
+ }
+
+ ssize = MMU_SEGSIZE_1T;
+ if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+ ssize = MMU_SEGSIZE_256M;
+
+ context = get_kernel_context(ea);
+ return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+ unsigned long context;
+ unsigned long flags;
+ int bpsize;
+ int ssize;
+
+ /*
+ * consider this as bad access if we take a SLB miss
+ * on an address above addr limit.
+ */
+ if (ea >= mm->context.slb_addr_limit)
+ return -EFAULT;
+
+ context = get_user_context(&mm->context, ea);
+ if (!context)
+ return -EFAULT;
+
+ if (unlikely(ea >= H_PGTABLE_RANGE)) {
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ ssize = user_segment_size(ea);
+
+ bpsize = get_slice_psize(mm, ea);
+ flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+ return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+{
+ unsigned long id = REGION_ID(ea);
+
+ /* IRQs are not reconciled here, so can't check irqs_disabled */
+ VM_WARN_ON(mfmsr() & MSR_EE);
+
+ if (unlikely(!(regs->msr & MSR_RI)))
+ return -EINVAL;
+
+ /*
+ * SLB kernel faults must be very careful not to touch anything
+ * that is not bolted. E.g., PACA and global variables are okay,
+ * mm->context stuff is not.
+ *
+ * SLB user faults can access all of kernel memory, but must be
+ * careful not to touch things like IRQ state because it is not
+ * "reconciled" here. The difficulty is that we must use
+ * fast_exception_return to return from kernel SLB faults without
+ * looking at possible non-bolted memory. We could test user vs
+ * kernel faults in the interrupt handler asm and do a full fault,
+ * reconcile, ret_from_except for user faults which would make them
+ * first class kernel code. But for performance it's probably nicer
+ * if they go via fast_exception_return too.
+ */
+ if (id >= KERNEL_REGION_ID) {
+ long err;
+#ifdef CONFIG_DEBUG_VM
+ /* Catch recursive kernel SLB faults. */
+ BUG_ON(local_paca->in_kernel_slb_handler);
+ local_paca->in_kernel_slb_handler = 1;
+#endif
+ err = slb_allocate_kernel(ea, id);
+#ifdef CONFIG_DEBUG_VM
+ local_paca->in_kernel_slb_handler = 0;
+#endif
+ return err;
+ } else {
+ struct mm_struct *mm = current->mm;
+ long err;
+
+ if (unlikely(!mm))
+ return -EFAULT;
+
+ err = slb_allocate_user(mm, ea);
+ if (!err)
+ preload_add(current_thread_info(), ea);
+
+ return err;
+ }
+}
+
+void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
+{
+ if (err == -EFAULT) {
+ if (user_mode(regs))
+ _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
+ else
+ bad_page_fault(regs, ea, SIGSEGV);
+ } else if (err == -EINVAL) {
+ unrecoverable_exception(regs);
+ } else {
+ BUG();
+ }
+}
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
deleted file mode 100644
index 2cf5ef3fc50d..000000000000
--- a/arch/powerpc/mm/slb_low.S
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Low-level SLB routines
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- *
- * Based on earlier C version:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/processor.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
-#include <asm/pgtable.h>
-#include <asm/firmware.h>
-
-/*
- * This macro generates asm code to compute the VSID scramble
- * function. Used in slb_allocate() and do_stab_bolted. The function
- * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
- *
- * rt = register containing the proto-VSID and into which the
- * VSID will be stored
- * rx = scratch register (clobbered)
- * rf = flags
- *
- * - rt and rx must be different registers
- * - The answer will end up in the low VSID_BITS bits of rt. The higher
- * bits may contain other garbage, so you may need to mask the
- * result.
- */
-#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \
- lis rx,VSID_MULTIPLIER_##size@h; \
- ori rx,rx,VSID_MULTIPLIER_##size@l; \
- mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \
-/* \
- * powermac get slb fault before feature fixup, so make 65 bit part \
- * the default part of feature fixup \
- */ \
-BEGIN_MMU_FTR_SECTION \
- srdi rx,rt,VSID_BITS_65_##size; \
- clrldi rt,rt,(64-VSID_BITS_65_##size); \
- add rt,rt,rx; \
- addi rx,rt,1; \
- srdi rx,rx,VSID_BITS_65_##size; \
- add rt,rt,rx; \
- rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
-MMU_FTR_SECTION_ELSE \
- srdi rx,rt,VSID_BITS_##size; \
- clrldi rt,rt,(64-VSID_BITS_##size); \
- add rt,rt,rx; /* add high and low bits */ \
- addi rx,rt,1; \
- srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \
- add rt,rt,rx; \
- rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
-
-
-/* void slb_allocate(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- * r3 = faulting address, r13 = PACA
- * r9, r10, r11 are clobbered by this function
- * r3 is preserved.
- * No other registers are examined or changed.
- */
-_GLOBAL(slb_allocate)
- /*
- * check for bad kernel/user address
- * (ea & ~REGION_MASK) >= PGTABLE_RANGE
- */
- rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4)
- bne- 8f
-
- srdi r9,r3,60 /* get region */
- srdi r10,r3,SID_SHIFT /* get esid */
- cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
-
- /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
- blt cr7,0f /* user or kernel? */
-
- /* Check if hitting the linear mapping or some other kernel space
- */
- bne cr7,1f
-
- /* Linear mapping encoding bits, the "li" instruction below will
- * be patched by the kernel at boot
- */
-.globl slb_miss_kernel_load_linear
-slb_miss_kernel_load_linear:
- li r11,0
- /*
- * context = (ea >> 60) - (0xc - 1)
- * r9 = region id.
- */
- subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
-
-BEGIN_FTR_SECTION
- b .Lslb_finish_load
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
- b .Lslb_finish_load_1T
-
-1:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- cmpldi cr0,r9,0xf
- bne 1f
-/* Check virtual memmap region. To be patched at kernel boot */
-.globl slb_miss_kernel_load_vmemmap
-slb_miss_kernel_load_vmemmap:
- li r11,0
- b 6f
-1:
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
- /*
- * r10 contains the ESID, which is the original faulting EA shifted
- * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28)
- * which is 0xd00038000. That can't be used as an immediate, even if we
- * ignored the 0xd, so we have to load it into a register, and we only
- * have one register free. So we must load all of (H_VMALLOC_END >> 28)
- * into a register and compare ESID against that.
- */
- lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000
- ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800
- // Rotate left 4, then mask with 0xffffffff0
- rldic r11,r11,4,28 // r11 = 0xd00038000
- cmpld r10,r11 // if r10 >= r11
- bge 5f // goto io_mapping
-
- /*
- * vmalloc mapping gets the encoding from the PACA as the mapping
- * can be demoted from 64K -> 4K dynamically on some machines.
- */
- lhz r11,PACAVMALLOCSLLP(r13)
- b 6f
-5:
- /* IO mapping */
-.globl slb_miss_kernel_load_io
-slb_miss_kernel_load_io:
- li r11,0
-6:
- /*
- * context = (ea >> 60) - (0xc - 1)
- * r9 = region id.
- */
- subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
-
-BEGIN_FTR_SECTION
- b .Lslb_finish_load
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
- b .Lslb_finish_load_1T
-
-0: /*
- * For userspace addresses, make sure this is region 0.
- */
- cmpdi r9, 0
- bne- 8f
- /*
- * user space make sure we are within the allowed limit
- */
- ld r11,PACA_SLB_ADDR_LIMIT(r13)
- cmpld r3,r11
- bge- 8f
-
- /* when using slices, we extract the psize off the slice bitmaps
- * and then we need to get the sllp encoding off the mmu_psize_defs
- * array.
- *
- * XXX This is a bit inefficient especially for the normal case,
- * so we should try to implement a fast path for the standard page
- * size using the old sllp value so we avoid the array. We cannot
- * really do dynamic patching unfortunately as processes might flip
- * between 4k and 64k standard page size
- */
-#ifdef CONFIG_PPC_MM_SLICES
- /* r10 have esid */
- cmpldi r10,16
- /* below SLICE_LOW_TOP */
- blt 5f
- /*
- * Handle hpsizes,
- * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
- */
- srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
- addi r9,r11,PACAHIGHSLICEPSIZE
- lbzx r9,r13,r9 /* r9 is hpsizes[r11] */
- /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
- rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
- b 6f
-
-5:
- /*
- * Handle lpsizes
- * r9 is get_paca()->context.low_slices_psize, r11 is index
- */
- ld r9,PACALOWSLICESPSIZE(r13)
- mr r11,r10
-6:
- sldi r11,r11,2 /* index * 4 */
- /* Extract the psize and multiply to get an array offset */
- srd r9,r9,r11
- andi. r9,r9,0xf
- mulli r9,r9,MMUPSIZEDEFSIZE
-
- /* Now get to the array and obtain the sllp
- */
- ld r11,PACATOC(r13)
- ld r11,mmu_psize_defs@got(r11)
- add r11,r11,r9
- ld r11,MMUPSIZESLLP(r11)
- ori r11,r11,SLB_VSID_USER
-#else
- /* paca context sllp already contains the SLB_VSID_USER bits */
- lhz r11,PACACONTEXTSLLP(r13)
-#endif /* CONFIG_PPC_MM_SLICES */
-
- ld r9,PACACONTEXTID(r13)
-BEGIN_FTR_SECTION
- cmpldi r10,0x1000
- bge .Lslb_finish_load_1T
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- b .Lslb_finish_load
-
-8: /* invalid EA - return an error indication */
- crset 4*cr0+eq /* indicate failure */
- blr
-
-/*
- * Finish loading of an SLB entry and return
- *
- * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
- */
-.Lslb_finish_load:
- rldimi r10,r9,ESID_BITS,0
- ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
- /* r3 = EA, r11 = VSID data */
- /*
- * Find a slot, round robin. Previously we tried to find a
- * free slot first but that took too long. Unfortunately we
- * dont have any LRU information to help us choose a slot.
- */
-
- mr r9,r3
-
- /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */
-7: ld r10,PACASTABRR(r13)
- addi r10,r10,1
- /* This gets soft patched on boot. */
-.globl slb_compare_rr_to_size
-slb_compare_rr_to_size:
- cmpldi r10,0
-
- blt+ 4f
- li r10,SLB_NUM_BOLTED
-
-4:
- std r10,PACASTABRR(r13)
-
-3:
- rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */
- oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */
-
- /* r9 = ESID data, r11 = VSID data */
-
- /*
- * No need for an isync before or after this slbmte. The exception
- * we enter with and the rfid we exit with are context synchronizing.
- */
- slbmte r11,r10
-
- /* we're done for kernel addresses */
- crclr 4*cr0+eq /* set result to "success" */
- bgelr cr7
-
- /* Update the slb cache */
- lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
- cmpldi r9,SLB_CACHE_ENTRIES
- bge 1f
-
- /* still room in the slb cache */
- sldi r11,r9,2 /* r11 = offset * sizeof(u32) */
- srdi r10,r10,28 /* get the 36 bits of the ESID */
- add r11,r11,r13 /* r11 = (u32 *)paca + offset */
- stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
- addi r9,r9,1 /* offset++ */
- b 2f
-1: /* offset >= SLB_CACHE_ENTRIES */
- li r9,SLB_CACHE_ENTRIES+1
-2:
- sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
- crclr 4*cr0+eq /* set result to "success" */
- blr
-
-/*
- * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
- *
- * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
- */
-.Lslb_finish_load_1T:
- srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
- rldimi r10,r9,ESID_BITS_1T,0
- ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
-
- li r10,MMU_SEGSIZE_1T
- rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
-
- /* r3 = EA, r11 = VSID data */
- clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */
- b 7b
-
-
-_ASM_NOKPROBE_SYMBOL(slb_allocate)
-_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear)
-_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io)
-_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size)
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap)
-#endif
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 23ec2c5e3b78..06898c13901d 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -31,58 +31,62 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
#include <asm/mman.h>
#include <asm/mmu.h>
#include <asm/copro.h>
#include <asm/hugetlb.h>
+#include <asm/mmu_context.h>
static DEFINE_SPINLOCK(slice_convert_lock);
-/*
- * One bit per slice. We have lower slices which cover 256MB segments
- * upto 4G range. That gets us 16 low slices. For the rest we track slices
- * in 1TB size.
- */
-struct slice_mask {
- u64 low_slices;
- DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
-};
#ifdef DEBUG
int _slice_debug = 1;
-static void slice_print_mask(const char *label, struct slice_mask mask)
+static void slice_print_mask(const char *label, const struct slice_mask *mask)
{
if (!_slice_debug)
return;
- pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
- pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
+ pr_devel("%s low_slice: %*pbl\n", label,
+ (int)SLICE_NUM_LOW, &mask->low_slices);
+ pr_devel("%s high_slice: %*pbl\n", label,
+ (int)SLICE_NUM_HIGH, mask->high_slices);
}
#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
#else
-static void slice_print_mask(const char *label, struct slice_mask mask) {}
+static void slice_print_mask(const char *label, const struct slice_mask *mask) {}
#define slice_dbg(fmt...)
#endif
+static inline bool slice_addr_is_low(unsigned long addr)
+{
+ u64 tmp = (u64)addr;
+
+ return tmp < SLICE_LOW_TOP;
+}
+
static void slice_range_to_mask(unsigned long start, unsigned long len,
struct slice_mask *ret)
{
unsigned long end = start + len - 1;
ret->low_slices = 0;
- bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+ if (SLICE_NUM_HIGH)
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
- if (start < SLICE_LOW_TOP) {
- unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
+ if (slice_addr_is_low(start)) {
+ unsigned long mend = min(end,
+ (unsigned long)(SLICE_LOW_TOP - 1));
ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
- (1u << GET_LOW_SLICE_INDEX(start));
}
- if ((start + len) > SLICE_LOW_TOP) {
+ if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
@@ -113,11 +117,13 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
unsigned long start = slice << SLICE_HIGH_SHIFT;
unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
+#ifdef CONFIG_PPC64
/* Hack, so that each addresses is controlled by exactly one
* of the high or low area bitmaps, the first high area starts
* at 4GB, not 0 */
if (start == 0)
start = SLICE_LOW_TOP;
+#endif
return !slice_area_is_free(mm, start, end - start);
}
@@ -128,13 +134,14 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
unsigned long i;
ret->low_slices = 0;
- bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+ if (SLICE_NUM_HIGH)
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
for (i = 0; i < SLICE_NUM_LOW; i++)
if (!slice_low_has_vma(mm, i))
ret->low_slices |= 1u << i;
- if (high_limit <= SLICE_LOW_TOP)
+ if (slice_addr_is_low(high_limit - 1))
return;
for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++)
@@ -142,53 +149,75 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
__set_bit(i, ret->high_slices);
}
-static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret,
- unsigned long high_limit)
+#ifdef CONFIG_PPC_BOOK3S_64
+static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
{
- unsigned char *hpsizes;
- int index, mask_index;
- unsigned long i;
- u64 lpsizes;
-
- ret->low_slices = 0;
- bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+#ifdef CONFIG_PPC_64K_PAGES
+ if (psize == MMU_PAGE_64K)
+ return &mm->context.mask_64k;
+#endif
+ if (psize == MMU_PAGE_4K)
+ return &mm->context.mask_4k;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (psize == MMU_PAGE_16M)
+ return &mm->context.mask_16m;
+ if (psize == MMU_PAGE_16G)
+ return &mm->context.mask_16g;
+#endif
+ BUG();
+}
+#elif defined(CONFIG_PPC_8xx)
+static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
+{
+ if (psize == mmu_virtual_psize)
+ return &mm->context.mask_base_psize;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (psize == MMU_PAGE_512K)
+ return &mm->context.mask_512k;
+ if (psize == MMU_PAGE_8M)
+ return &mm->context.mask_8m;
+#endif
+ BUG();
+}
+#else
+#error "Must define the slice masks for page sizes supported by the platform"
+#endif
- lpsizes = mm->context.low_slices_psize;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (((lpsizes >> (i * 4)) & 0xf) == psize)
- ret->low_slices |= 1u << i;
+static bool slice_check_range_fits(struct mm_struct *mm,
+ const struct slice_mask *available,
+ unsigned long start, unsigned long len)
+{
+ unsigned long end = start + len - 1;
+ u64 low_slices = 0;
- if (high_limit <= SLICE_LOW_TOP)
- return;
+ if (slice_addr_is_low(start)) {
+ unsigned long mend = min(end,
+ (unsigned long)(SLICE_LOW_TOP - 1));
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) {
- mask_index = i & 0x1;
- index = i >> 1;
- if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
- __set_bit(i, ret->high_slices);
+ low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+ - (1u << GET_LOW_SLICE_INDEX(start));
}
-}
+ if ((low_slices & available->low_slices) != low_slices)
+ return false;
-static int slice_check_fit(struct mm_struct *mm,
- struct slice_mask mask, struct slice_mask available)
-{
- DECLARE_BITMAP(result, SLICE_NUM_HIGH);
- /*
- * Make sure we just do bit compare only to the max
- * addr limit and not the full bit map size.
- */
- unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
+ if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
+ unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+ unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+ unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
+ unsigned long i;
- bitmap_and(result, mask.high_slices,
- available.high_slices, slice_count);
+ for (i = start_index; i < start_index + count; i++) {
+ if (!test_bit(i, available->high_slices))
+ return false;
+ }
+ }
- return (mask.low_slices & available.low_slices) == mask.low_slices &&
- bitmap_equal(result, mask.high_slices, slice_count);
+ return true;
}
static void slice_flush_segments(void *parm)
{
+#ifdef CONFIG_PPC64
struct mm_struct *mm = parm;
unsigned long flags;
@@ -198,42 +227,66 @@ static void slice_flush_segments(void *parm)
copy_mm_to_paca(current->active_mm);
local_irq_save(flags);
- slb_flush_and_rebolt();
+ slb_flush_and_restore_bolted();
local_irq_restore(flags);
+#endif
}
-static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
+static void slice_convert(struct mm_struct *mm,
+ const struct slice_mask *mask, int psize)
{
int index, mask_index;
/* Write the new slice psize bits */
- unsigned char *hpsizes;
- u64 lpsizes;
+ unsigned char *hpsizes, *lpsizes;
+ struct slice_mask *psize_mask, *old_mask;
unsigned long i, flags;
+ int old_psize;
slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
slice_print_mask(" mask", mask);
+ psize_mask = slice_mask_for_size(mm, psize);
+
/* We need to use a spinlock here to protect against
* concurrent 64k -> 4k demotion ...
*/
spin_lock_irqsave(&slice_convert_lock, flags);
lpsizes = mm->context.low_slices_psize;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (mask.low_slices & (1u << i))
- lpsizes = (lpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
+ for (i = 0; i < SLICE_NUM_LOW; i++) {
+ if (!(mask->low_slices & (1u << i)))
+ continue;
+
+ mask_index = i & 0x1;
+ index = i >> 1;
- /* Assign the value back */
- mm->context.low_slices_psize = lpsizes;
+ /* Update the slice_mask */
+ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf;
+ old_mask = slice_mask_for_size(mm, old_psize);
+ old_mask->low_slices &= ~(1u << i);
+ psize_mask->low_slices |= 1u << i;
+
+ /* Update the sizes array */
+ lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) |
+ (((unsigned long)psize) << (mask_index * 4));
+ }
hpsizes = mm->context.high_slices_psize;
for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
+ if (!test_bit(i, mask->high_slices))
+ continue;
+
mask_index = i & 0x1;
index = i >> 1;
- if (test_bit(i, mask.high_slices))
- hpsizes[index] = (hpsizes[index] &
- ~(0xf << (mask_index * 4))) |
+
+ /* Update the slice_mask */
+ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf;
+ old_mask = slice_mask_for_size(mm, old_psize);
+ __clear_bit(i, old_mask->high_slices);
+ __set_bit(i, psize_mask->high_slices);
+
+ /* Update the sizes array */
+ hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) |
(((unsigned long)psize) << (mask_index * 4));
}
@@ -254,26 +307,25 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
* 'available' slice_mark.
*/
static bool slice_scan_available(unsigned long addr,
- struct slice_mask available,
- int end,
- unsigned long *boundary_addr)
+ const struct slice_mask *available,
+ int end, unsigned long *boundary_addr)
{
unsigned long slice;
- if (addr < SLICE_LOW_TOP) {
+ if (slice_addr_is_low(addr)) {
slice = GET_LOW_SLICE_INDEX(addr);
*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
- return !!(available.low_slices & (1u << slice));
+ return !!(available->low_slices & (1u << slice));
} else {
slice = GET_HIGH_SLICE_INDEX(addr);
*boundary_addr = (slice + end) ?
((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
- return !!test_bit(slice, available.high_slices);
+ return !!test_bit(slice, available->high_slices);
}
}
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
- struct slice_mask available,
+ const struct slice_mask *available,
int psize, unsigned long high_limit)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
@@ -319,7 +371,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
- struct slice_mask available,
+ const struct slice_mask *available,
int psize, unsigned long high_limit)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
@@ -377,7 +429,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
- struct slice_mask mask, int psize,
+ const struct slice_mask *mask, int psize,
int topdown, unsigned long high_limit)
{
if (topdown)
@@ -386,23 +438,33 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
}
-static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
+static inline void slice_copy_mask(struct slice_mask *dst,
+ const struct slice_mask *src)
{
- DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-
- dst->low_slices |= src->low_slices;
- bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
- bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+ dst->low_slices = src->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
}
-static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
+static inline void slice_or_mask(struct slice_mask *dst,
+ const struct slice_mask *src1,
+ const struct slice_mask *src2)
{
- DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-
- dst->low_slices &= ~src->low_slices;
+ dst->low_slices = src1->low_slices | src2->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
+}
- bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
- bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+static inline void slice_andnot_mask(struct slice_mask *dst,
+ const struct slice_mask *src1,
+ const struct slice_mask *src2)
+{
+ dst->low_slices = src1->low_slices & ~src2->low_slices;
+ if (!SLICE_NUM_HIGH)
+ return;
+ bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH);
}
#ifdef CONFIG_PPC_64K_PAGES
@@ -415,10 +477,10 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
int topdown)
{
- struct slice_mask mask;
struct slice_mask good_mask;
struct slice_mask potential_mask;
- struct slice_mask compat_mask;
+ const struct slice_mask *maskp;
+ const struct slice_mask *compat_maskp = NULL;
int fixed = (flags & MAP_FIXED);
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long page_size = 1UL << pshift;
@@ -442,23 +504,16 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
}
if (high_limit > mm->context.slb_addr_limit) {
+ /*
+ * Increasing the slb_addr_limit does not require
+ * slice mask cache to be recalculated because it should
+ * be already initialised beyond the old address limit.
+ */
mm->context.slb_addr_limit = high_limit;
+
on_each_cpu(slice_flush_segments, mm, 1);
}
- /*
- * init different masks
- */
- mask.low_slices = 0;
- bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
-
- /* silence stupid warning */;
- potential_mask.low_slices = 0;
- bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
-
- compat_mask.low_slices = 0;
- bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
-
/* Sanity checks */
BUG_ON(mm->task_size == 0);
BUG_ON(mm->context.slb_addr_limit == 0);
@@ -481,8 +536,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* First make up a "good" mask of slices that have the right size
* already
*/
- slice_mask_for_size(mm, psize, &good_mask, high_limit);
- slice_print_mask(" good_mask", good_mask);
+ maskp = slice_mask_for_size(mm, psize);
/*
* Here "good" means slices that are already the right page size,
@@ -503,40 +557,47 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* search in good | compat | free, found => convert free.
*/
-#ifdef CONFIG_PPC_64K_PAGES
- /* If we support combo pages, we can allow 64k pages in 4k slices */
- if (psize == MMU_PAGE_64K) {
- slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit);
+ /*
+ * If we support combo pages, we can allow 64k pages in 4k slices
+ * The mask copies could be avoided in most cases here if we had
+ * a pointer to good mask for the next code to use.
+ */
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
+ compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
if (fixed)
- slice_or_mask(&good_mask, &compat_mask);
+ slice_or_mask(&good_mask, maskp, compat_maskp);
+ else
+ slice_copy_mask(&good_mask, maskp);
+ } else {
+ slice_copy_mask(&good_mask, maskp);
}
-#endif
+
+ slice_print_mask(" good_mask", &good_mask);
+ if (compat_maskp)
+ slice_print_mask(" compat_mask", compat_maskp);
/* First check hint if it's valid or if we have MAP_FIXED */
if (addr != 0 || fixed) {
- /* Build a mask for the requested range */
- slice_range_to_mask(addr, len, &mask);
- slice_print_mask(" mask", mask);
-
/* Check if we fit in the good mask. If we do, we just return,
* nothing else to do
*/
- if (slice_check_fit(mm, mask, good_mask)) {
+ if (slice_check_range_fits(mm, &good_mask, addr, len)) {
slice_dbg(" fits good !\n");
- return addr;
+ newaddr = addr;
+ goto return_addr;
}
} else {
/* Now let's see if we can find something in the existing
* slices for that size
*/
- newaddr = slice_find_area(mm, len, good_mask,
+ newaddr = slice_find_area(mm, len, &good_mask,
psize, topdown, high_limit);
if (newaddr != -ENOMEM) {
/* Found within the good mask, we don't have to setup,
* we thus return directly
*/
slice_dbg(" found area at 0x%lx\n", newaddr);
- return newaddr;
+ goto return_addr;
}
}
/*
@@ -544,12 +605,15 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* empty and thus can be converted
*/
slice_mask_for_free(mm, &potential_mask, high_limit);
- slice_or_mask(&potential_mask, &good_mask);
- slice_print_mask(" potential", potential_mask);
+ slice_or_mask(&potential_mask, &potential_mask, &good_mask);
+ slice_print_mask(" potential", &potential_mask);
- if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
- slice_dbg(" fits potential !\n");
- goto convert;
+ if (addr != 0 || fixed) {
+ if (slice_check_range_fits(mm, &potential_mask, addr, len)) {
+ slice_dbg(" fits potential !\n");
+ newaddr = addr;
+ goto convert;
+ }
}
/* If we have MAP_FIXED and failed the above steps, then error out */
@@ -562,46 +626,64 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* anywhere in the good area.
*/
if (addr) {
- addr = slice_find_area(mm, len, good_mask,
- psize, topdown, high_limit);
- if (addr != -ENOMEM) {
- slice_dbg(" found area at 0x%lx\n", addr);
- return addr;
+ newaddr = slice_find_area(mm, len, &good_mask,
+ psize, topdown, high_limit);
+ if (newaddr != -ENOMEM) {
+ slice_dbg(" found area at 0x%lx\n", newaddr);
+ goto return_addr;
}
}
/* Now let's see if we can find something in the existing slices
* for that size plus free slices
*/
- addr = slice_find_area(mm, len, potential_mask,
- psize, topdown, high_limit);
+ newaddr = slice_find_area(mm, len, &potential_mask,
+ psize, topdown, high_limit);
#ifdef CONFIG_PPC_64K_PAGES
- if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
+ if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
- slice_or_mask(&potential_mask, &compat_mask);
- addr = slice_find_area(mm, len, potential_mask,
- psize, topdown, high_limit);
+ slice_or_mask(&potential_mask, &potential_mask, compat_maskp);
+ newaddr = slice_find_area(mm, len, &potential_mask,
+ psize, topdown, high_limit);
}
#endif
- if (addr == -ENOMEM)
+ if (newaddr == -ENOMEM)
return -ENOMEM;
- slice_range_to_mask(addr, len, &mask);
- slice_dbg(" found potential area at 0x%lx\n", addr);
- slice_print_mask(" mask", mask);
+ slice_range_to_mask(newaddr, len, &potential_mask);
+ slice_dbg(" found potential area at 0x%lx\n", newaddr);
+ slice_print_mask(" mask", &potential_mask);
convert:
- slice_andnot_mask(&mask, &good_mask);
- slice_andnot_mask(&mask, &compat_mask);
- if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
- slice_convert(mm, mask, psize);
+ /*
+ * Try to allocate the context before we do slice convert
+ * so that we handle the context allocation failure gracefully.
+ */
+ if (need_extra_context(mm, newaddr)) {
+ if (alloc_extended_context(mm, newaddr) < 0)
+ return -ENOMEM;
+ }
+
+ slice_andnot_mask(&potential_mask, &potential_mask, &good_mask);
+ if (compat_maskp && !fixed)
+ slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp);
+ if (potential_mask.low_slices ||
+ (SLICE_NUM_HIGH &&
+ !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) {
+ slice_convert(mm, &potential_mask, psize);
if (psize > MMU_PAGE_BASE)
on_each_cpu(slice_flush_segments, mm, 1);
}
- return addr;
+ return newaddr;
+return_addr:
+ if (need_extra_context(mm, newaddr)) {
+ if (alloc_extended_context(mm, newaddr) < 0)
+ return -ENOMEM;
+ }
+ return newaddr;
}
EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
@@ -627,95 +709,75 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
{
- unsigned char *hpsizes;
+ unsigned char *psizes;
int index, mask_index;
- /*
- * Radix doesn't use slice, but can get enabled along with MMU_SLICE
- */
- if (radix_enabled()) {
-#ifdef CONFIG_PPC_64K_PAGES
- return MMU_PAGE_64K;
-#else
- return MMU_PAGE_4K;
-#endif
- }
- if (addr < SLICE_LOW_TOP) {
- u64 lpsizes;
- lpsizes = mm->context.low_slices_psize;
+ VM_BUG_ON(radix_enabled());
+
+ if (slice_addr_is_low(addr)) {
+ psizes = mm->context.low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
- return (lpsizes >> (index * 4)) & 0xf;
+ } else {
+ psizes = mm->context.high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
}
- hpsizes = mm->context.high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
mask_index = index & 0x1;
- return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf;
+ return (psizes[index >> 1] >> (mask_index * 4)) & 0xf;
}
EXPORT_SYMBOL_GPL(get_slice_psize);
-/*
- * This is called by hash_page when it needs to do a lazy conversion of
- * an address space from real 64K pages to combo 4K pages (typically
- * when hitting a non cacheable mapping on a processor or hypervisor
- * that won't allow them for 64K pages).
- *
- * This is also called in init_new_context() to change back the user
- * psize from whatever the parent context had it set to
- * N.B. This may be called before mm->context.id has been set.
- *
- * This function will only change the content of the {low,high)_slice_psize
- * masks, it will not flush SLBs as this shall be handled lazily by the
- * caller.
- */
-void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
+void slice_init_new_context_exec(struct mm_struct *mm)
{
- int index, mask_index;
- unsigned char *hpsizes;
- unsigned long flags, lpsizes;
- unsigned int old_psize;
- int i;
+ unsigned char *hpsizes, *lpsizes;
+ struct slice_mask *mask;
+ unsigned int psize = mmu_virtual_psize;
- slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
+ slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm);
- VM_BUG_ON(radix_enabled());
- spin_lock_irqsave(&slice_convert_lock, flags);
-
- old_psize = mm->context.user_psize;
- slice_dbg(" old_psize=%d\n", old_psize);
- if (old_psize == psize)
- goto bail;
+ /*
+ * In the case of exec, use the default limit. In the
+ * case of fork it is just inherited from the mm being
+ * duplicated.
+ */
+#ifdef CONFIG_PPC64
+ mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
+#else
+ mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW;
+#endif
mm->context.user_psize = psize;
- wmb();
+ /*
+ * Set all slice psizes to the default.
+ */
lpsizes = mm->context.low_slices_psize;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
- lpsizes = (lpsizes & ~(0xful << (i * 4))) |
- (((unsigned long)psize) << (i * 4));
- /* Assign the value back */
- mm->context.low_slices_psize = lpsizes;
+ memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1);
hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++) {
- mask_index = i & 0x1;
- index = i >> 1;
- if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize)
- hpsizes[index] = (hpsizes[index] &
- ~(0xf << (mask_index * 4))) |
- (((unsigned long)psize) << (mask_index * 4));
- }
+ memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1);
+ /*
+ * Slice mask cache starts zeroed, fill the default size cache.
+ */
+ mask = slice_mask_for_size(mm, psize);
+ mask->low_slices = ~0UL;
+ if (SLICE_NUM_HIGH)
+ bitmap_fill(mask->high_slices, SLICE_NUM_HIGH);
+}
+#ifdef CONFIG_PPC_BOOK3S_64
+void slice_setup_new_exec(void)
+{
+ struct mm_struct *mm = current->mm;
+ slice_dbg("slice_setup_new_exec(mm=%p)\n", mm);
- slice_dbg(" lsps=%lx, hsps=%lx\n",
- (unsigned long)mm->context.low_slices_psize,
- (unsigned long)mm->context.high_slices_psize);
+ if (!is_32bit_task())
+ return;
- bail:
- spin_unlock_irqrestore(&slice_convert_lock, flags);
+ mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW;
}
+#endif
void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize)
@@ -725,7 +787,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
VM_BUG_ON(radix_enabled());
slice_range_to_mask(start, len, &mask);
- slice_convert(mm, mask, psize);
+ slice_convert(mm, &mask, psize);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -748,33 +810,27 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
* for now as we only use slices with hugetlbfs enabled. This should
* be fixed as the generic code gets fixed.
*/
-int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
+int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
- struct slice_mask mask, available;
+ const struct slice_mask *maskp;
unsigned int psize = mm->context.user_psize;
- unsigned long high_limit = mm->context.slb_addr_limit;
- if (radix_enabled())
- return 0;
+ VM_BUG_ON(radix_enabled());
- slice_range_to_mask(addr, len, &mask);
- slice_mask_for_size(mm, psize, &available, high_limit);
+ maskp = slice_mask_for_size(mm, psize);
#ifdef CONFIG_PPC_64K_PAGES
/* We need to account for 4k slices too */
if (psize == MMU_PAGE_64K) {
- struct slice_mask compat_mask;
- slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit);
- slice_or_mask(&available, &compat_mask);
+ const struct slice_mask *compat_maskp;
+ struct slice_mask available;
+
+ compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
+ slice_or_mask(&available, maskp, compat_maskp);
+ return !slice_check_range_fits(mm, &available, addr, len);
}
#endif
-#if 0 /* too verbose */
- slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
- mm, addr, len);
- slice_print_mask(" mask", mask);
- slice_print_mask(" available", available);
-#endif
- return !slice_check_fit(mm, mask, available);
+ return !slice_check_range_fits(mm, maskp, addr, len);
}
#endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index f14a07c2fb90..3327551c8b47 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -13,10 +13,10 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
-#include <asm/tlbflush.h>
/*
* Free all pages allocated for subpage protection maps and pointers.
@@ -185,7 +185,8 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
* in a 2-bit field won't allow writes to a page that is otherwise
* write-protected.
*/
-long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+ unsigned long, len, u32 __user *, map)
{
struct mm_struct *mm = current->mm;
struct subpage_prot_table *spt = &mm->context.spt;
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index a07f5372a4bf..6a23b9ebd2a1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -12,6 +12,8 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
#include <asm/ppc-opcode.h>
#include <asm/tlb.h>
@@ -33,13 +35,12 @@ static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
{
unsigned long rb;
unsigned long rs;
- unsigned int r = 1; /* radix format */
rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
- asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
- : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
: "memory");
}
@@ -98,7 +99,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
rb |= set << PPC_BITLSHIFT(51);
rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
prs = 1; /* process scoped */
- r = 1; /* raidx format */
+ r = 1; /* radix format */
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
@@ -112,13 +113,60 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
rb = PPC_BIT(53); /* IS = 1 */
rs = pid << PPC_BITLSHIFT(31);
prs = 1; /* process scoped */
- r = 1; /* raidx format */
+ r = 1; /* radix format */
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static inline void __tlbiel_lpid(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
@@ -128,7 +176,7 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
rb |= ap << PPC_BITLSHIFT(58);
rs = pid << PPC_BITLSHIFT(31);
prs = 1; /* process scoped */
- r = 1; /* raidx format */
+ r = 1; /* radix format */
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
@@ -144,13 +192,29 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
rb |= ap << PPC_BITLSHIFT(58);
rs = pid << PPC_BITLSHIFT(31);
prs = 1; /* process scoped */
- r = 1; /* raidx format */
+ r = 1; /* radix format */
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
static inline void fixup_tlbie(void)
{
unsigned long pid = 0;
@@ -162,6 +226,16 @@ static inline void fixup_tlbie(void)
}
}
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
@@ -215,6 +289,87 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid_guest(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
+}
+
+
static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
@@ -269,6 +424,17 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, ric);
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
static inline void _tlbie_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize, bool also_pwc)
@@ -341,6 +507,15 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
}
EXPORT_SYMBOL(radix__local_flush_tlb_page);
+static bool mm_is_singlethreaded(struct mm_struct *mm)
+{
+ if (atomic_read(&mm->context.copros) > 0)
+ return false;
+ if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
+ return true;
+ return false;
+}
+
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
/*
@@ -348,10 +523,47 @@ static bool mm_needs_flush_escalation(struct mm_struct *mm)
* caching PTEs and not flushing them properly when
* RIC = 0 for a PID/LPID invalidate
*/
- return atomic_read(&mm->context.copros) != 0;
+ if (atomic_read(&mm->context.copros) > 0)
+ return true;
+ return false;
}
#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+ unsigned long pid = mm->context.id;
+
+ if (current->mm == mm)
+ return; /* Local CPU */
+
+ if (current->active_mm == mm) {
+ /*
+ * Must be a kernel thread because sender is single-threaded.
+ */
+ BUG_ON(current->mm);
+ mmgrab(&init_mm);
+ switch_mm(mm, &init_mm, current);
+ current->active_mm = &init_mm;
+ mmdrop(mm);
+ }
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+ /*
+ * Would be nice if this was async so it could be run in
+ * parallel with our local flush, but generic code does not
+ * give a good API for it. Could extend the generic code or
+ * make a special powerpc IPI for flushing TLBs.
+ * For now it's not too performance critical.
+ */
+ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+ (void *)mm, 1);
+ mm_reset_thread_local(mm);
+}
+
void radix__flush_tlb_mm(struct mm_struct *mm)
{
unsigned long pid;
@@ -361,18 +573,30 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
return;
preempt_disable();
+ /*
+ * Order loads of mm_cpumask vs previous stores to clear ptes before
+ * the invalidate. See barrier in switch_mm_irqs_off
+ */
+ smp_mb();
if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+
if (mm_needs_flush_escalation(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
- } else
+ } else {
+local:
_tlbiel_pid(pid, RIC_FLUSH_TLB);
+ }
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
-void radix__flush_all_mm(struct mm_struct *mm)
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
{
unsigned long pid;
@@ -381,12 +605,25 @@ void radix__flush_all_mm(struct mm_struct *mm)
return;
preempt_disable();
- if (!mm_is_thread_local(mm))
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (!fullmm) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+ }
_tlbie_pid(pid, RIC_FLUSH_ALL);
- else
+ } else {
+local:
_tlbiel_pid(pid, RIC_FLUSH_ALL);
+ }
preempt_enable();
}
+void radix__flush_all_mm(struct mm_struct *mm)
+{
+ __flush_all_mm(mm, false);
+}
EXPORT_SYMBOL(radix__flush_all_mm);
void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
@@ -405,10 +642,17 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
return;
preempt_disable();
- if (!mm_is_thread_local(mm))
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
- else
+ } else {
+local:
_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ }
preempt_enable();
}
@@ -446,35 +690,38 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool flush_all_sizes)
{
- struct mm_struct *mm = vma->vm_mm;
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
bool local, full;
-#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma))
- return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
- if (mm_is_thread_local(mm)) {
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- } else {
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (end != TLB_FLUSH_ALL) {
+ exit_flush_lazy_tlbs(mm);
+ goto is_local;
+ }
+ }
local = false;
full = (end == TLB_FLUSH_ALL ||
nr_pages > tlb_single_page_flush_ceiling);
+ } else {
+is_local:
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
}
if (full) {
@@ -487,37 +734,64 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
_tlbie_pid(pid, RIC_FLUSH_TLB);
}
} else {
- bool hflush = false;
+ bool hflush = flush_all_sizes;
+ bool gflush = flush_all_sizes;
unsigned long hstart, hend;
+ unsigned long gstart, gend;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
- hend = end >> HPAGE_PMD_SHIFT;
- if (hstart < hend) {
- hstart <<= HPAGE_PMD_SHIFT;
- hend <<= HPAGE_PMD_SHIFT;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
hflush = true;
+
+ if (hflush) {
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ if (hstart == hend)
+ hflush = false;
+ }
+
+ if (gflush) {
+ gstart = (start + PUD_SIZE - 1) & PUD_MASK;
+ gend = end & PUD_MASK;
+ if (gstart == gend)
+ gflush = false;
}
-#endif
asm volatile("ptesync": : :"memory");
if (local) {
__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
- HPAGE_PMD_SIZE, MMU_PAGE_2M);
+ PMD_SIZE, MMU_PAGE_2M);
+ if (gflush)
+ __tlbiel_va_range(gstart, gend, pid,
+ PUD_SIZE, MMU_PAGE_1G);
asm volatile("ptesync": : :"memory");
} else {
__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbie_va_range(hstart, hend, pid,
- HPAGE_PMD_SIZE, MMU_PAGE_2M);
+ PMD_SIZE, MMU_PAGE_2M);
+ if (gflush)
+ __tlbie_va_range(gstart, gend, pid,
+ PUD_SIZE, MMU_PAGE_1G);
fixup_tlbie();
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
}
preempt_enable();
}
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+ __radix__flush_tlb_range(vma->vm_mm, start, end, false);
+}
EXPORT_SYMBOL(radix__flush_tlb_range);
static int radix_get_mmu_psize(int page_size)
@@ -535,6 +809,58 @@ static int radix_get_mmu_psize(int page_size)
return psize;
}
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+ unsigned long addr,
+ unsigned long page_size)
+{
+ int psize = radix_get_mmu_psize(page_size);
+
+ _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * Important difference, the guest normally manages its own translations,
+ * but some cases e.g., vCPU CPU migration require KVM to flush.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+ _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize);
@@ -543,6 +869,8 @@ void radix__tlb_flush(struct mmu_gather *tlb)
int psize = 0;
struct mm_struct *mm = tlb->mm;
int page_size = tlb->page_size;
+ unsigned long start = tlb->start;
+ unsigned long end = tlb->end;
/*
* if page size is not something we understand, do a full mm flush
@@ -552,16 +880,46 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* See the comment for radix in arch_exit_mmap().
*/
if (tlb->fullmm) {
- radix__flush_all_mm(mm);
+ __flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+ } else if (mm_tlb_flush_nested(mm)) {
+ /*
+ * If there is a concurrent invalidation that is clearing ptes,
+ * then it's possible this invalidation will miss one of those
+ * cleared ptes and miss flushing the TLB. If this invalidate
+ * returns before the other one flushes TLBs, that can result
+ * in it returning while there are still valid TLBs inside the
+ * range to be invalidated.
+ *
+ * See mm/memory.c:tlb_finish_mmu() for more details.
+ *
+ * The solution to this is ensure the entire range is always
+ * flushed here. The problem for powerpc is that the flushes
+ * are page size specific, so this "forced flush" would not
+ * do the right thing if there are a mix of page sizes in
+ * the range to be invalidated. So use __flush_tlb_range
+ * which invalidates all possible page sizes in the range.
+ *
+ * PWC flush probably is not be required because the core code
+ * shouldn't free page tables in this path, but accounting
+ * for the possibility makes us a bit more robust.
+ *
+ * need_flush_all is an uncommon case because page table
+ * teardown should be done with exclusive locks held (but
+ * after locks are dropped another invalidate could come
+ * in), it could be optimized further if necessary.
+ */
+ if (!tlb->need_flush_all)
+ __radix__flush_tlb_range(mm, start, end, true);
+ else
+ radix__flush_all_mm(mm);
+#endif
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->need_flush_all)
radix__flush_tlb_mm(mm);
else
radix__flush_all_mm(mm);
} else {
- unsigned long start = tlb->start;
- unsigned long end = tlb->end;
-
if (!tlb->need_flush_all)
radix__flush_tlb_range_psize(mm, start, end, psize);
else
@@ -585,24 +943,33 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
return;
preempt_disable();
- if (mm_is_thread_local(mm)) {
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- } else {
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (end != TLB_FLUSH_ALL) {
+ exit_flush_lazy_tlbs(mm);
+ goto is_local;
+ }
+ }
local = false;
full = (end == TLB_FLUSH_ALL ||
nr_pages > tlb_single_page_flush_ceiling);
+ } else {
+is_local:
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
}
if (full) {
- if (!local && mm_needs_flush_escalation(mm))
- also_pwc = true;
-
- if (local)
+ if (local) {
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
- else
- _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
+ } else {
+ if (mm_needs_flush_escalation(mm))
+ also_pwc = true;
+
+ _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ }
} else {
if (local)
_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
@@ -643,11 +1010,16 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
/* Otherwise first do the PWC, then iterate the pages. */
preempt_disable();
-
- if (mm_is_thread_local(mm)) {
- _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
- } else {
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ } else {
+local:
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
}
preempt_enable();
@@ -668,7 +1040,7 @@ void radix__flush_tlb_all(void)
rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
prs = 0; /* partition scoped */
- r = 1; /* raidx format */
+ r = 1; /* radix format */
rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
asm volatile("ptesync": : :"memory");
@@ -685,28 +1057,10 @@ void radix__flush_tlb_all(void)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
- unsigned long address)
-{
- /*
- * We track page size in pte only for DD1, So we can
- * call this only on DD1.
- */
- if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) {
- VM_WARN_ON(1);
- return;
- }
-
- if (old_pte & R_PAGE_LARGE)
- radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
- else
- radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
-}
-
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
{
- unsigned int pid = mm->context.id;
+ unsigned long pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
@@ -734,7 +1088,9 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
if (sib == cpu)
continue;
- if (paca[sib].kvm_hstate.kvm_vcpu)
+ if (!cpu_possible(sib))
+ continue;
+ if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
flush = true;
}
if (flush)
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 702d7689d714..cf8472cf3d59 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -41,7 +41,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
{
unsigned long ptephys;
- if (Hash != 0) {
+ if (Hash) {
ptephys = __pa(ptep) & PAGE_MASK;
flush_hash_pages(mm->context.id, addr, ptephys, 1);
}
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(flush_hash_entry);
*/
void tlb_flush(struct mmu_gather *tlb)
{
- if (Hash == 0) {
+ if (!Hash) {
/*
* 603 needs to flush the whole TLB here since
* it doesn't use a hash table.
@@ -84,7 +84,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
int count;
unsigned int ctx = mm->context.id;
- if (Hash == 0) {
+ if (!Hash) {
_tlbia();
return;
}
@@ -124,7 +124,7 @@ void flush_tlb_mm(struct mm_struct *mm)
{
struct vm_area_struct *mp;
- if (Hash == 0) {
+ if (!Hash) {
_tlbia();
return;
}
@@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
struct mm_struct *mm;
pmd_t *pmd;
- if (Hash == 0) {
+ if (!Hash) {
_tlbie(vmaddr);
return;
}
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 9b23f12e863c..87d71dd25441 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -89,7 +89,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
/* Build full vaddr */
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
- vsid = get_vsid(mm->context.id, addr, ssize);
+ vsid = get_user_vsid(&mm->context, addr, ssize);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index eb82d787d99a..7fd20c52a8ec 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -22,6 +22,7 @@
#include <asm/ppc-opcode.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_booke_hv_asm.h>
+#include <asm/feature-fixups.h>
#ifdef CONFIG_PPC_64K_PAGES
#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 15fe5f0c8665..ae5d568e267f 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -503,6 +503,9 @@ static void setup_page_sizes(void)
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
struct mmu_psize_def *def = &mmu_psize_defs[psize];
+ if (!def->shift)
+ continue;
+
if (tlb1ps & (1U << (def->shift - 10))) {
def->flags |= MMU_PAGE_SIZE_DIRECT;
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 048b8e9f4492..e066a658acac 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -34,6 +34,8 @@
#include <asm/asm-offsets.h>
#include <asm/processor.h>
#include <asm/bug.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
#if defined(CONFIG_40x)
OpenPOWER on IntegriCloud