diff options
Diffstat (limited to 'arch/powerpc/mm')
51 files changed, 3027 insertions, 2076 deletions
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index 82b1ff759e26..12d92518e898 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, } #ifdef CONFIG_SMP -void mmu_init_secondary(int cpu) +void __init mmu_init_secondary(int cpu) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 849f50cd62f2..01b7f5107c3a 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -13,6 +13,7 @@ */ #include <linux/memblock.h> +#include <linux/mmu_context.h> #include <asm/fixmap.h> #include <asm/code-patching.h> @@ -67,7 +68,7 @@ void __init MMU_init_hw(void) /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ #ifdef CONFIG_PIN_TLB_DATA unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; #ifdef CONFIG_PIN_TLB_IMMR int i = 29; #else @@ -79,7 +80,7 @@ void __init MMU_init_hw(void) for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { mtspr(SPRN_MD_CTR, ctr | (i << 8)); mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID | M_APG2); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); addr += LARGE_PAGE_SIZE_8M; mem -= LARGE_PAGE_SIZE_8M; @@ -91,29 +92,19 @@ static void __init mmu_mapin_immr(void) { unsigned long p = PHYS_IMMR_BASE; unsigned long v = VIRT_IMMR_BASE; - unsigned long f = pgprot_val(PAGE_KERNEL_NCG); int offset; for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_kernel_page(v + offset, p + offset, f); + map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } -/* Address of instructions to patch */ -#ifndef CONFIG_PIN_TLB_IMMR -extern unsigned int DTLBMiss_jmp; -#endif -extern unsigned int DTLBMiss_cmp, FixupDAR_cmp; -#ifndef CONFIG_PIN_TLB_TEXT -extern unsigned int ITLBMiss_cmp; -#endif - -static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) +static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) { - unsigned int instr = *addr; + unsigned int instr = *(unsigned int *)patch_site_addr(site); instr &= 0xffff0000; instr |= (unsigned long)__va(mapped) >> 16; - patch_instruction(addr, instr); + patch_instruction_site(site, instr); } unsigned long __init mmu_mapin_ram(unsigned long top) @@ -124,17 +115,17 @@ unsigned long __init mmu_mapin_ram(unsigned long top) mapped = 0; mmu_mapin_immr(); #ifndef CONFIG_PIN_TLB_IMMR - patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP); + patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); #endif #ifndef CONFIG_PIN_TLB_TEXT - mmu_patch_cmp_limit(&ITLBMiss_cmp, 0); + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); #endif } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); } - mmu_patch_cmp_limit(&DTLBMiss_cmp, mapped); - mmu_patch_cmp_limit(&FixupDAR_cmp, mapped); + mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); + mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped); /* If the size of RAM is not an exact power of two, we may not * have covered RAM in its entirety with 8 MiB @@ -192,7 +183,7 @@ void set_context(unsigned long id, pgd_t *pgd) mtspr(SPRN_M_TW, __pa(pgd) - offset); /* Update context */ - mtspr(SPRN_M_CASID, id); + mtspr(SPRN_M_CASID, id - 1); /* sync */ mb(); } diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index f06f3577d8d1..ca96e7be4d0e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -3,10 +3,10 @@ # Makefile for the linux ppc-specific parts of the memory manager. # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o @@ -15,11 +15,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o -ifeq ($(CONFIG_PPC_BOOK3S_64),y) +ifdef CONFIG_PPC_BOOK3S_64 obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif @@ -31,7 +31,7 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o -ifeq ($(CONFIG_HUGETLB_PAGE),y) +ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o @@ -43,5 +43,12 @@ obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o +ifdef CONFIG_PPC_PTDUMP +obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o +endif obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 697b70ad1195..c8da352e8686 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -34,7 +34,7 @@ * to handle fortunately. */ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, - unsigned long dsisr, unsigned *flt) + unsigned long dsisr, vm_fault_t *flt) { struct vm_area_struct *vma; unsigned long is_write; @@ -112,7 +112,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) return 1; psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); + vsid = get_user_vsid(&mm->context, ea, ssize); vsidkey = SLB_VSID_USER; break; case VMALLOC_REGION_ID: diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 382528475433..b6e7b5952ab5 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -228,7 +228,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t do { SetPageReserved(page); map_kernel_page(vaddr, page_to_phys(page), - pgprot_val(pgprot_noncached(PAGE_KERNEL))); + pgprot_noncached(PAGE_KERNEL)); page++; vaddr += PAGE_SIZE; } while (size -= PAGE_SIZE); diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index 14cfb11b09d0..869294695048 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -19,7 +19,6 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <asm/fixmap.h> #include <asm/pgtable.h> #include <linux/const.h> #include <asm/page.h> @@ -260,7 +259,7 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 * /* to check in the secondary hash table, we invert the hash */ if (!primary) hash = ~hash; - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* see if we can find an entry in the hpte with this hash */ for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/dump_linuxpagetables-8xx.c new file mode 100644 index 000000000000..ab9e3f24db2f --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-8xx.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_SH, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = 0, + .set = "rw", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_RO, + .set = "r ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_NA, + .set = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c new file mode 100644 index 000000000000..ed6fcf78256e --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_PRIVILEGED, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "r", + .clear = " ", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "w", + .clear = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PTE, + .val = _PAGE_PTE, + .set = "pte", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "valid", + .clear = " ", + }, { + .mask = _PAGE_PRESENT | _PAGE_INVALID, + .val = 0, + .set = " ", + .clear = "present", + }, { + .mask = H_PAGE_HASHPTE, + .val = H_PAGE_HASHPTE, + .set = "hpte", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NON_IDEMPOTENT, + .val = _PAGE_NON_IDEMPOTENT, + .set = "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val = _PAGE_TOLERANT, + .set = "tolerant", + .clear = " ", + }, { + .mask = H_PAGE_BUSY, + .val = H_PAGE_BUSY, + .set = "busy", + }, { +#ifdef CONFIG_PPC_64K_PAGES + .mask = H_PAGE_COMBO, + .val = H_PAGE_COMBO, + .set = "combo", + }, { + .mask = H_PAGE_4K_PFN, + .val = H_PAGE_4K_PFN, + .set = "4K_pfn", + }, { +#else /* CONFIG_PPC_64K_PAGES */ + .mask = H_PAGE_F_GIX, + .val = H_PAGE_F_GIX, + .set = "f_gix", + .is_val = true, + .shift = H_PAGE_F_GIX_SHIFT, + }, { + .mask = H_PAGE_F_SECOND, + .val = H_PAGE_F_SECOND, + .set = "f_second", + }, { +#endif /* CONFIG_PPC_64K_PAGES */ + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/dump_linuxpagetables-generic.c new file mode 100644 index 000000000000..1e3829ec1348 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-generic.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_USER, + .val = _PAGE_USER, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RW, + .val = _PAGE_RW, + .set = "rw", + .clear = "r ", + }, { +#ifndef CONFIG_PPC_BOOK3S_32 + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { +#endif + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_WRITETHRU, + .val = _PAGE_WRITETHRU, + .set = "write through", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 876e2a3c79f2..2b74f8adf4d0 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -27,6 +27,8 @@ #include <asm/page.h> #include <asm/pgalloc.h> +#include "dump_linuxpagetables.h" + #ifdef CONFIG_PPC32 #define KERN_VIRT_START 0 #endif @@ -101,159 +103,6 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; -struct flag_info { - u64 mask; - u64 val; - const char *set; - const char *clear; - bool is_val; - int shift; -}; - -static const struct flag_info flag_array[] = { - { - .mask = _PAGE_USER | _PAGE_PRIVILEGED, - .val = _PAGE_USER, - .set = "user", - .clear = " ", - }, { - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RW, - .set = "rw", - }, { - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "ro", - }, { -#if _PAGE_NA != 0 - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "na", - }, { -#endif - .mask = _PAGE_EXEC, - .val = _PAGE_EXEC, - .set = " X ", - .clear = " ", - }, { - .mask = _PAGE_PTE, - .val = _PAGE_PTE, - .set = "pte", - .clear = " ", - }, { - .mask = _PAGE_PRESENT, - .val = _PAGE_PRESENT, - .set = "present", - .clear = " ", - }, { -#ifdef CONFIG_PPC_BOOK3S_64 - .mask = H_PAGE_HASHPTE, - .val = H_PAGE_HASHPTE, -#else - .mask = _PAGE_HASHPTE, - .val = _PAGE_HASHPTE, -#endif - .set = "hpte", - .clear = " ", - }, { -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_GUARDED, - .val = _PAGE_GUARDED, - .set = "guarded", - .clear = " ", - }, { -#endif - .mask = _PAGE_DIRTY, - .val = _PAGE_DIRTY, - .set = "dirty", - .clear = " ", - }, { - .mask = _PAGE_ACCESSED, - .val = _PAGE_ACCESSED, - .set = "accessed", - .clear = " ", - }, { -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_WRITETHRU, - .val = _PAGE_WRITETHRU, - .set = "write through", - .clear = " ", - }, { -#endif -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_NO_CACHE, - .val = _PAGE_NO_CACHE, - .set = "no cache", - .clear = " ", - }, { -#else - .mask = _PAGE_NON_IDEMPOTENT, - .val = _PAGE_NON_IDEMPOTENT, - .set = "non-idempotent", - .clear = " ", - }, { - .mask = _PAGE_TOLERANT, - .val = _PAGE_TOLERANT, - .set = "tolerant", - .clear = " ", - }, { -#endif -#ifdef CONFIG_PPC_BOOK3S_64 - .mask = H_PAGE_BUSY, - .val = H_PAGE_BUSY, - .set = "busy", - }, { -#ifdef CONFIG_PPC_64K_PAGES - .mask = H_PAGE_COMBO, - .val = H_PAGE_COMBO, - .set = "combo", - }, { - .mask = H_PAGE_4K_PFN, - .val = H_PAGE_4K_PFN, - .set = "4K_pfn", - }, { -#else /* CONFIG_PPC_64K_PAGES */ - .mask = H_PAGE_F_GIX, - .val = H_PAGE_F_GIX, - .set = "f_gix", - .is_val = true, - .shift = H_PAGE_F_GIX_SHIFT, - }, { - .mask = H_PAGE_F_SECOND, - .val = H_PAGE_F_SECOND, - .set = "f_second", - }, { -#endif /* CONFIG_PPC_64K_PAGES */ -#endif - .mask = _PAGE_SPECIAL, - .val = _PAGE_SPECIAL, - .set = "special", - } -}; - -struct pgtable_level { - const struct flag_info *flag; - size_t num; - u64 mask; -}; - -static struct pgtable_level pg_level[] = { - { - }, { /* pgd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pud */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pmd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pte */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, -}; - static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { @@ -418,12 +267,13 @@ static void walk_pagetables(struct pg_state *st) unsigned int i; unsigned long addr; + addr = st->start_address; + /* * Traverse the linux pagetable structure and dump pages that are in * the hash pagetable. */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { - addr = KERN_VIRT_START + i * PGDIR_SIZE; + for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { if (!pgd_none(*pgd) && !pgd_huge(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); @@ -472,9 +322,14 @@ static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .seq = m, - .start_address = KERN_VIRT_START, .marker = address_markers, }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + /* Traverse kernel page tables */ walk_pagetables(&st); note_page(&st, 0, 0, 0); diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/dump_linuxpagetables.h new file mode 100644 index 000000000000..5d513636de73 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> + +struct flag_info { + u64 mask; + u64 val; + const char *set; + const char *clear; + bool is_val; + int shift; +}; + +struct pgtable_level { + const struct flag_info *flag; + size_t num; + u64 mask; +}; + +extern struct pgtable_level pg_level[5]; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 866446cf2d9a..1697e903bbf2 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -22,6 +22,7 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/mman.h> #include <linux/mm.h> @@ -41,7 +42,6 @@ #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/tlbflush.h> #include <asm/siginfo.h> #include <asm/debug.h> @@ -66,37 +66,33 @@ static inline bool notify_page_fault(struct pt_regs *regs) } /* - * Check whether the instruction at regs->nip is a store using + * Check whether the instruction inst is a store using * an update addressing form which will update r1. */ -static bool store_updates_sp(struct pt_regs *regs) +static bool store_updates_sp(unsigned int inst) { - unsigned int inst; - - if (get_user(inst, (unsigned int __user *)regs->nip)) - return false; /* check for 1 in the rA field */ if (((inst >> 16) & 0x1f) != 1) return false; /* check major opcode */ switch (inst >> 26) { - case 37: /* stwu */ - case 39: /* stbu */ - case 45: /* sthu */ - case 53: /* stfsu */ - case 55: /* stfdu */ + case OP_STWU: + case OP_STBU: + case OP_STHU: + case OP_STFSU: + case OP_STFDU: return true; - case 62: /* std or stdu */ + case OP_STD: /* std or stdu */ return (inst & 3) == 1; - case 31: + case OP_31: /* check minor opcode */ switch ((inst >> 1) & 0x3ff) { - case 181: /* stdux */ - case 183: /* stwux */ - case 247: /* stbux */ - case 439: /* sthux */ - case 695: /* stfsux */ - case 759: /* stfdux */ + case OP_31_XOP_STDUX: + case OP_31_XOP_STWUX: + case OP_31_XOP_STBUX: + case OP_31_XOP_STHUX: + case OP_31_XOP_STFSUX: + case OP_31_XOP_STFDUX: return true; } } @@ -107,8 +103,7 @@ static bool store_updates_sp(struct pt_regs *regs) */ static int -__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code, - int pkey) +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) { /* * If we are in kernel mode, bail out with a SEGV, this will @@ -118,18 +113,17 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code, if (!user_mode(regs)) return SIGSEGV; - _exception_pkey(SIGSEGV, regs, si_code, address, pkey); + _exception(SIGSEGV, regs, si_code, address); return 0; } static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) { - return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0); + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); } -static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, - int pkey) +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -139,57 +133,66 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, */ up_read(&mm->mmap_sem); - return __bad_area_nosemaphore(regs, address, si_code, pkey); + return __bad_area_nosemaphore(regs, address, si_code); } static noinline int bad_area(struct pt_regs *regs, unsigned long address) { - return __bad_area(regs, address, SEGV_MAPERR, 0); + return __bad_area(regs, address, SEGV_MAPERR); } static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address, int pkey) { - return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey); + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; + + _exception_pkey(regs, address, pkey); + + return 0; } static noinline int bad_access(struct pt_regs *regs, unsigned long address) { - return __bad_area(regs, address, SEGV_ACCERR, 0); + return __bad_area(regs, address, SEGV_ACCERR); } static int do_sigbus(struct pt_regs *regs, unsigned long address, - unsigned int fault) + vm_fault_t fault) { - siginfo_t info; - unsigned int lsb = 0; - if (!user_mode(regs)) return SIGBUS; current->thread.trap_nr = BUS_ADRERR; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + unsigned int lsb = 0; /* shutup gcc */ + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", current->comm, current->pid, address); - info.si_code = BUS_MCEERR_AR; + + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, + current); + return 0; } - if (fault & VM_FAULT_HWPOISON_LARGE) - lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); - if (fault & VM_FAULT_HWPOISON) - lsb = PAGE_SHIFT; #endif - info.si_addr_lsb = lsb; - force_sig_info(SIGBUS, &info, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); return 0; } -static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) +static int mm_fault_error(struct pt_regs *regs, unsigned long addr, + vm_fault_t fault) { /* * Kernel page fault interrupted by SIGKILL. We have no reason to @@ -234,8 +237,8 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code, } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, - struct vm_area_struct *vma, - bool store_update_sp) + struct vm_area_struct *vma, unsigned int flags, + bool *must_retry) { /* * N.B. The POWER/Open ABI allows programs to access up to @@ -247,6 +250,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, * expand to 1MB without further checks. */ if (address + 0x100000 < vma->vm_end) { + unsigned int __user *nip = (unsigned int __user *)regs->nip; /* get user regs even if this fault is in kernel mode */ struct pt_regs *uregs = current->thread.regs; if (uregs == NULL) @@ -264,8 +268,22 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, * between the last mapped region and the stack will * expand the stack rather than segfaulting. */ - if (address + 2048 < uregs->gpr[1] && !store_update_sp) - return true; + if (address + 2048 >= uregs->gpr[1]) + return false; + + if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && + access_ok(VERIFY_READ, nip, sizeof(*nip))) { + unsigned int inst; + int res; + + pagefault_disable(); + res = __get_user_inatomic(inst, nip); + pagefault_enable(); + if (!res) + return !store_updates_sp(inst); + *must_retry = true; + } + return true; } return false; } @@ -297,7 +315,12 @@ static bool access_error(bool is_write, bool is_exec, if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) return true; - + /* + * We should ideally do the vma pkey access check here. But in the + * fault path, handle_mm_fault() also does the same check. To avoid + * these multiple checks, we skip it here and handle access error due + * to pkeys later. + */ return false; } @@ -397,8 +420,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, int is_exec = TRAP(regs) == 0x400; int is_user = user_mode(regs); int is_write = page_fault_is_write(error_code); - int fault, major = 0; - bool store_update_sp = false; + vm_fault_t fault, major = 0; + bool must_retry = false; if (notify_page_fault(regs)) return 0; @@ -449,9 +472,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, * can result in fault, which will cause a deadlock when called with * mmap_sem held */ - if (is_write && is_user) - store_update_sp = store_updates_sp(regs); - if (is_user) flags |= FAULT_FLAG_USER; if (is_write) @@ -498,8 +518,17 @@ retry: return bad_area(regs, address); /* The stack is being expanded, check if it's valid */ - if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp))) - return bad_area(regs, address); + if (unlikely(bad_stack_expansion(regs, address, vma, flags, + &must_retry))) { + if (!must_retry) + return bad_area(regs, address); + + up_read(&mm->mmap_sem); + if (fault_in_pages_readable((const char __user *)regs->nip, + sizeof(unsigned int))) + return bad_area_nosemaphore(regs, address); + goto retry; + } /* Try to expand it */ if (unlikely(expand_stack(vma, address))) @@ -518,25 +547,16 @@ good_area: #ifdef CONFIG_PPC_MEM_KEYS /* - * if the HPTE is not hashed, hardware will not detect - * a key fault. Lets check if we failed because of a - * software detected key fault. + * we skipped checking for access error due to key earlier. + * Check that using handle_mm_fault error return. */ if (unlikely(fault & VM_FAULT_SIGSEGV) && - !arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, - is_exec, 0)) { - /* - * The PGD-PDT...PMD-PTE tree may not have been fully setup. - * Hence we cannot walk the tree to locate the PTE, to locate - * the key. Hence let's use vma_pkey() to get the key; instead - * of get_mm_addr_key(). - */ + !arch_vma_access_permitted(vma, is_write, is_exec, 0)) { + int pkey = vma_pkey(vma); - if (likely(pkey)) { - up_read(&mm->mmap_sem); - return bad_key_fault_exception(regs, address, pkey); - } + up_read(&mm->mmap_sem); + return bad_key_fault_exception(regs, address, pkey); } #endif /* CONFIG_PPC_MEM_KEYS */ diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index d573d7d07f25..6fa6765a10eb 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -80,7 +80,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, hash = hpt_hash(vpn, shift, ssize); repeat: - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* Insert into the hash table, primary slot */ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, @@ -89,7 +89,7 @@ repeat: * Primary is full, try the secondary */ if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, HPTE_V_SECONDARY, @@ -97,8 +97,8 @@ repeat: MMU_PAGE_4K, ssize); if (slot == -1) { if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; mmu_hash_ops.hpte_remove(hpte_group); /* * FIXME!! Should be try the group from which we removed ? diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index e601d95c3b20..3afa253d7f52 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -154,7 +154,7 @@ htab_insert_hpte: } hash = hpt_hash(vpn, shift, ssize); repeat: - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* Insert into the hash table, primary slot */ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, @@ -165,7 +165,7 @@ repeat: if (unlikely(slot == -1)) { bool soft_invalid; - hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, HPTE_V_SECONDARY, MMU_PAGE_4K, MMU_PAGE_4K, @@ -193,8 +193,7 @@ repeat: * that we do not get the same soft-invalid slot. */ if (soft_invalid || (mftb() & 0x1)) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; mmu_hash_ops.hpte_remove(hpte_group); /* @@ -288,7 +287,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, hash = hpt_hash(vpn, shift, ssize); repeat: - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* Insert into the hash table, primary slot */ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, @@ -298,7 +297,7 @@ repeat: * Primary is full, try the secondary */ if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, HPTE_V_SECONDARY, @@ -306,8 +305,8 @@ repeat: MMU_PAGE_64K, ssize); if (slot == -1) { if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; mmu_hash_ops.hpte_remove(hpte_group); /* * FIXME!! Should be try the group from which we removed ? diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index ffbd7c0bda96..26acf6c8c20c 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -27,6 +27,7 @@ #include <asm/thread_info.h> #include <asm/asm-offsets.h> #include <asm/export.h> +#include <asm/feature-fixups.h> #ifdef CONFIG_SMP .section .bss diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 656933c85925..aaa28fd918fe 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -23,13 +23,13 @@ #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> -#include <asm/tlbflush.h> #include <asm/trace.h> #include <asm/tlb.h> #include <asm/cputable.h> #include <asm/udbg.h> #include <asm/kexec.h> #include <asm/ppc-opcode.h> +#include <asm/feature-fixups.h> #include <misc/cxl-base.h> @@ -115,6 +115,8 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) tlbiel_hash_set_isa300(0, is, 0, 2, 1); asm volatile("ptesync": : :"memory"); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } void hash__tlbiel_all(unsigned int action) @@ -140,8 +142,6 @@ void hash__tlbiel_all(unsigned int action) tlbiel_all_isa206(POWER7_TLB_SETS, is); else WARN(1, "%s called on pre-POWER7 CPU\n", __func__); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } static inline unsigned long ___tlbie(unsigned long vpn, int psize, @@ -423,9 +423,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", vpn, want_v & HPTE_V_AVPN, slot, newpp); - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); + hpte_v = hpte_get_old_v(hptep); /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -439,9 +437,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, } else { native_lock_hpte(hptep); /* recheck with locks held */ - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); + hpte_v = hpte_get_old_v(hptep); if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))) { ret = -1; @@ -481,11 +477,9 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) /* Bolted mappings are only ever in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + slot; - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); + hptep = htab_address + slot; + hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) /* HPTE matches */ return slot; @@ -574,11 +568,19 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); want_v = hpte_encode_avpn(vpn, bpsize, ssize); - native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + else + native_unlock_hpte(hptep); + } /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -586,13 +588,6 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, * (hpte_remove) because we assume the old translation is still * technically "valid". */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - - /* Invalidate the TLB */ tlbie(vpn, bpsize, apsize, ssize, local); local_irq_restore(flags); @@ -634,17 +629,23 @@ static void native_hugepage_invalidate(unsigned long vsid, hptep = htab_address + slot; want_v = hpte_encode_avpn(vpn, psize, ssize); - native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); + hpte_v = hpte_get_old_v(hptep); /* Even if we miss, we need to invalidate the TLB */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* recheck with locks held */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* + * Invalidate the hpte. NOTE: this also unlocks it + */ + + hptep->v = 0; + } else + native_unlock_hpte(hptep); + } /* * We need to do tlb invalidate for all the address, tlbie * instruction compares entry_VA in tlb with the VA specified @@ -812,16 +813,19 @@ static void native_flush_hash_range(unsigned long number, int local) slot += hidx & _PTEIDX_GROUP_IX; hptep = htab_address + slot; want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + continue; + /* lock and try again */ native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, - be64_to_cpu(hptep->r)); - if (!HPTE_V_COMPARE(hpte_v, want_v) || - !(hpte_v & HPTE_V_VALID)) + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); else hptep->v = 0; + } pte_iterate_hashed_end(); } @@ -866,18 +870,6 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_restore(flags); } -static int native_register_proc_table(unsigned long base, unsigned long page_size, - unsigned long table_size) -{ - unsigned long patb1 = base << 25; /* VSID */ - - patb1 |= (page_size << 5); /* sllp */ - patb1 |= table_size; - - partition_tb->patb1 = cpu_to_be64(patb1); - return 0; -} - void __init hpte_init_native(void) { mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; @@ -889,7 +881,4 @@ void __init hpte_init_native(void) mmu_hash_ops.hpte_clear_all = native_hpte_clear; mmu_hash_ops.flush_hash_range = native_flush_hash_range; mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - register_process_table = native_register_proc_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index cf290d415dcd..0cc7fbc3bd1c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -48,7 +48,6 @@ #include <linux/uaccess.h> #include <asm/machdep.h> #include <asm/prom.h> -#include <asm/tlbflush.h> #include <asm/io.h> #include <asm/eeh.h> #include <asm/tlb.h> @@ -64,6 +63,7 @@ #include <asm/trace.h> #include <asm/ps3.h> #include <asm/pte-walk.h> +#include <asm/asm-prototypes.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -132,9 +132,10 @@ EXPORT_SYMBOL(mmu_hash_ops); * is provided by the firmware. */ -/* Pre-POWER4 CPUs (4k pages only) +/* + * Fallback (4k pages only) */ -static struct mmu_psize_def mmu_psize_defaults_old[] = { +static struct mmu_psize_def mmu_psize_defaults[] = { [MMU_PAGE_4K] = { .shift = 12, .sllp = 0, @@ -554,8 +555,8 @@ static void __init htab_scan_page_sizes(void) mmu_psize_set_default_penc(); /* Default to 4K pages only */ - memcpy(mmu_psize_defs, mmu_psize_defaults_old, - sizeof(mmu_psize_defaults_old)); + memcpy(mmu_psize_defs, mmu_psize_defaults, + sizeof(mmu_psize_defaults)); /* * Try to find the available page sizes in the device-tree @@ -571,8 +572,10 @@ static void __init htab_scan_page_sizes(void) } #ifdef CONFIG_HUGETLB_PAGE - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + if (!hugetlb_disabled) { + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + } #endif /* CONFIG_HUGETLB_PAGE */ } @@ -781,7 +784,7 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size) } } -int hash__create_section_mapping(unsigned long start, unsigned long end) +int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) { int rc = htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, @@ -804,31 +807,6 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ -static void update_hid_for_hash(void) -{ - unsigned long hid0; - unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */ - - asm volatile("ptesync": : :"memory"); - /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory"); - asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); - trace_tlbie(0, 0, rb, 0, 2, 0, 0); - - /* - * now switch the HID - */ - hid0 = mfspr(SPRN_HID0); - hid0 &= ~HID0_POWER9_RADIX; - mtspr(SPRN_HID0, hid0); - asm volatile("isync": : :"memory"); - - /* Wait for it to happen */ - while ((mfspr(SPRN_HID0) & HID0_POWER9_RADIX)) - cpu_relax(); -} - static void __init hash_init_partition_table(phys_addr_t hash_table, unsigned long htab_size) { @@ -841,8 +819,6 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, htab_size = __ilog2(htab_size) - 18; mmu_partition_table_set_entry(0, hash_table | htab_size, 0); pr_info("Partition table %p\n", partition_tb); - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_hash(); } static void __init htab_initialize(void) @@ -875,6 +851,12 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + register_process_table(0, 0, 0); #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves @@ -1003,13 +985,14 @@ void __init hash__early_init_mmu(void) */ __pte_frag_nr = H_PTE_FRAG_NR; __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = H_PMD_FRAG_NR; + __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT; __pte_index_size = H_PTE_INDEX_SIZE; __pmd_index_size = H_PMD_INDEX_SIZE; __pud_index_size = H_PUD_INDEX_SIZE; __pgd_index_size = H_PGD_INDEX_SIZE; __pud_cache_index = H_PUD_CACHE_INDEX; - __pmd_cache_index = H_PMD_CACHE_INDEX; __pte_table_size = H_PTE_TABLE_SIZE; __pmd_table_size = H_PMD_TABLE_SIZE; __pud_table_size = H_PUD_TABLE_SIZE; @@ -1018,9 +1001,9 @@ void __init hash__early_init_mmu(void) * 4k use hugepd format, so for hash set then to * zero */ - __pmd_val_bits = 0; - __pud_val_bits = 0; - __pgd_val_bits = 0; + __pmd_val_bits = HASH_PMD_VAL_BITS; + __pud_val_bits = HASH_PUD_VAL_BITS; + __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; __kernel_virt_size = H_KERN_VIRT_SIZE; @@ -1066,9 +1049,6 @@ void hash__early_init_mmu_secondary(void) /* Initialize hash table for that CPU */ if (!firmware_has_feature(FW_FEATURE_LPAR)) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_hash(); - if (!cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_SDR1, _SDR1); else @@ -1110,19 +1090,18 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) #ifdef CONFIG_PPC_MM_SLICES static unsigned int get_paca_psize(unsigned long addr) { - u64 lpsizes; - unsigned char *hpsizes; + unsigned char *psizes; unsigned long index, mask_index; if (addr < SLICE_LOW_TOP) { - lpsizes = get_paca()->mm_ctx_low_slices_psize; + psizes = get_paca()->mm_ctx_low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); - return (lpsizes >> (index * 4)) & 0xF; + } else { + psizes = get_paca()->mm_ctx_high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); } - hpsizes = get_paca()->mm_ctx_high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); mask_index = index & 0x1; - return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; } #else @@ -1146,7 +1125,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { copy_mm_to_paca(mm); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); } } #endif /* CONFIG_PPC_64K_PAGES */ @@ -1218,7 +1197,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, if (user_region) { if (psize != get_paca_psize(ea)) { copy_mm_to_paca(mm); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); } } else if (get_paca()->vmalloc_sllp != mmu_psize_defs[mmu_vmalloc_psize].sllp) { @@ -1262,7 +1241,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, } psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); + vsid = get_user_vsid(&mm->context, ea, ssize); break; case VMALLOC_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); @@ -1503,7 +1482,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) #endif void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1511,6 +1490,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pte_t *ptep; unsigned long flags; int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); BUG_ON(REGION_ID(ea) != USER_REGION_ID); @@ -1527,7 +1507,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Get VSID */ ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); + vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; /* @@ -1773,8 +1753,7 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn, long slot; repeat: - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* Insert into the hash table, primary slot */ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags, @@ -1782,15 +1761,14 @@ repeat: /* Primary is full, try the secondary */ if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags | HPTE_V_SECONDARY, psize, psize, ssize); if (slot == -1) { if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP)&~0x7UL; + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; mmu_hash_ops.hpte_remove(hpte_group); goto repeat; diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 668e87d03f9e..82a0e37557a5 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -56,7 +56,7 @@ EXPORT_SYMBOL(kmap_atomic_prot); void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type; + int type __maybe_unused; if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index f20d16f849c5..dfbc3b32f09b 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -51,6 +51,12 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, new_pmd |= _PAGE_DIRTY; } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + /* + * Make sure this is thp or devmap entry + */ + if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) + return 0; + rflags = htab_convert_pte_flags(new_pmd); #if 0 @@ -128,7 +134,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, new_pmd |= H_PAGE_HASHPTE; repeat: - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; /* Insert into the hash table, primary slot */ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, @@ -137,16 +143,15 @@ repeat: * Primary is full, try the secondary */ if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, HPTE_V_SECONDARY, psize, lpsize, ssize); if (slot == -1) { if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; mmu_hash_ops.hpte_remove(hpte_group); goto repeat; diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index b320f5097a06..2e6a8f9345d3 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -62,6 +62,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, new_pte |= _PAGE_DIRTY; } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* Make sure this is a hugetlb entry */ + if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + return 0; + rflags = htab_convert_pte_flags(new_pte); if (unlikely(mmu_psize == MMU_PAGE_16G)) offset = PTRS_PER_PUD; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 876da2bc1796..8cf035e68378 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -15,10 +15,10 @@ #include <linux/export.h> #include <linux/of_fdt.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <linux/moduleparam.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -35,6 +35,8 @@ #define PAGE_SHIFT_16M 24 #define PAGE_SHIFT_16G 34 +bool hugetlb_disabled = false; + unsigned int HPAGE_SHIFT; EXPORT_SYMBOL(HPAGE_SHIFT); @@ -50,7 +52,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address, unsigned pdshift, unsigned pshift) + unsigned long address, unsigned int pdshift, + unsigned int pshift, spinlock_t *ptl) { struct kmem_cache *cachep; pte_t *new; @@ -80,8 +83,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, */ smp_wmb(); - spin_lock(&mm->page_table_lock); - + spin_lock(ptl); /* * We have multiple higher-level entries that point to the same * actual pte location. Fill in each as we go and backtrack on error. @@ -93,7 +95,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, break; else { #ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | + *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); #elif defined(CONFIG_PPC_8xx) *hpdp = __hugepd(__pa(new) | _PMD_USER | @@ -110,24 +112,14 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = i - 1 ; i >= 0; i--, hpdp--) *hpdp = __hugepd(0); kmem_cache_free(cachep, new); + } else { + kmemleak_ignore(new); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return 0; } /* - * These macros define how to determine which level of the page table holds - * the hpdp. - */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -#define HUGEPD_PGD_SHIFT PGDIR_SHIFT -#define HUGEPD_PUD_SHIFT PUD_SHIFT -#else -#define HUGEPD_PGD_SHIFT PUD_SHIFT -#define HUGEPD_PUD_SHIFT PMD_SHIFT -#endif - -/* * At this point we do the placement change only for BOOK3S 64. This would * possibly work on other subarchs. */ @@ -139,6 +131,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz hugepd_t *hpdp = NULL; unsigned pshift = __ffs(sz); unsigned pdshift = PGDIR_SHIFT; + spinlock_t *ptl; addr &= ~(sz-1); pg = pgd_offset(mm, addr); @@ -147,39 +140,46 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz if (pshift == PGDIR_SHIFT) /* 16GB huge page */ return (pte_t *) pg; - else if (pshift > PUD_SHIFT) + else if (pshift > PUD_SHIFT) { /* * We need to use hugepd table */ + ptl = &mm->page_table_lock; hpdp = (hugepd_t *)pg; - else { + } else { pdshift = PUD_SHIFT; pu = pud_alloc(mm, pg, addr); if (pshift == PUD_SHIFT) return (pte_t *)pu; - else if (pshift > PMD_SHIFT) + else if (pshift > PMD_SHIFT) { + ptl = pud_lockptr(mm, pu); hpdp = (hugepd_t *)pu; - else { + } else { pdshift = PMD_SHIFT; pm = pmd_alloc(mm, pu, addr); if (pshift == PMD_SHIFT) /* 16MB hugepage */ return (pte_t *)pm; - else + else { + ptl = pmd_lockptr(mm, pm); hpdp = (hugepd_t *)pm; + } } } #else - if (pshift >= HUGEPD_PGD_SHIFT) { + if (pshift >= PGDIR_SHIFT) { + ptl = &mm->page_table_lock; hpdp = (hugepd_t *)pg; } else { pdshift = PUD_SHIFT; pu = pud_alloc(mm, pg, addr); - if (pshift >= HUGEPD_PUD_SHIFT) { + if (pshift >= PUD_SHIFT) { + ptl = pud_lockptr(mm, pu); hpdp = (hugepd_t *)pu; } else { pdshift = PMD_SHIFT; pm = pmd_alloc(mm, pu, addr); + ptl = pmd_lockptr(mm, pm); hpdp = (hugepd_t *)pm; } } @@ -189,7 +189,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, + pdshift, pshift, ptl)) return NULL; return hugepte_offset(*hpdp, addr, pdshift); @@ -329,7 +330,8 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (shift >= pdshift) hugepd_free(tlb, hugepte); else - pgtable_free_tlb(tlb, hugepte, pdshift - shift); + pgtable_free_tlb(tlb, hugepte, + get_hugepd_cache_index(pdshift - shift)); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -500,6 +502,10 @@ struct page *follow_huge_pd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; retry: + /* + * hugepage directory entries are protected by mm->page_table_lock + * Use this instead of huge_pte_lockptr + */ ptl = &mm->page_table_lock; spin_lock(ptl); @@ -553,9 +559,11 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); +#ifdef CONFIG_PPC_RADIX_MMU if (radix_enabled()) return radix__hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); +#endif return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } #endif @@ -563,15 +571,14 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { #ifdef CONFIG_PPC_MM_SLICES - unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); /* With radix we don't use slice, so derive it from vma*/ - if (!radix_enabled()) + if (!radix_enabled()) { + unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); + return 1UL << mmu_psize_to_shift(psize); + } #endif - if (!is_vm_hugetlb_page(vma)) - return PAGE_SIZE; - - return huge_page_size(hstate_vma(vma)); + return vma_kernel_pagesize(vma); } static inline bool is_power_of_4(unsigned long x) @@ -607,15 +614,12 @@ static int __init add_huge_page_size(unsigned long long size) * firmware we only add hugetlb support for page sizes that can be * supported by linux page table layout. * For now we have - * Radix: 2M + * Radix: 2M and 1G * Hash: 16M and 16G */ if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1) || - (mmu_psize != MMU_PAGE_1G)) - return -EINVAL; - } + if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) + return -EINVAL; } else { if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) return -EINVAL; @@ -653,6 +657,11 @@ static int __init hugetlbpage_init(void) { int psize; + if (hugetlb_disabled) { + pr_info("HugeTLB support is disabled!\n"); + return 0; + } + #if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx) if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; @@ -666,15 +675,26 @@ static int __init hugetlbpage_init(void) shift = mmu_psize_to_shift(psize); - if (add_huge_page_size(1ULL << shift) < 0) +#ifdef CONFIG_PPC_BOOK3S_64 + if (shift > PGDIR_SHIFT) continue; - - if (shift < HUGEPD_PUD_SHIFT) + else if (shift > PUD_SHIFT) + pdshift = PGDIR_SHIFT; + else if (shift > PMD_SHIFT) + pdshift = PUD_SHIFT; + else + pdshift = PMD_SHIFT; +#else + if (shift < PUD_SHIFT) pdshift = PMD_SHIFT; - else if (shift < HUGEPD_PGD_SHIFT) + else if (shift < PGDIR_SHIFT) pdshift = PUD_SHIFT; else pdshift = PGDIR_SHIFT; +#endif + + if (add_huge_page_size(1ULL << shift) < 0) + continue; /* * if we have pdshift and shift value same, we don't * use pgt cache for hugepd. @@ -819,8 +839,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, ret_pte = (pte_t *) pmdp; goto out; } - - if (pmd_huge(pmd)) { + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { ret_pte = (pte_t *) pmdp; goto out; } else if (is_hugepd(__hugepd(pmd_val(pmd)))) diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 6419b33ca309..3e59e5d64b01 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -88,18 +88,13 @@ void MMU_init(void); int __map_without_bats; int __map_without_ltlbs; -/* - * This tells the system to allow ioremapping memory marked as reserved. - */ -int __allow_ioremap_reserved; - /* max amount of low RAM to map in */ unsigned long __max_low_memory = MAX_LOW_MEM; /* * Check for command-line options that affect what MMU_init will do. */ -void __init MMU_setup(void) +static void __init MMU_setup(void) { /* Check for nobats option (used in mapin_ram). */ if (strstr(boot_command_line, "nobats")) { diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index fdb424a29f03..7a9886f98b0c 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -68,12 +68,6 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_BOOK3S_64 -#if H_PGTABLE_RANGE > USER_VSID_RANGE -#warning Limited user VSID range means pagetable space is wasted -#endif -#endif /* CONFIG_PPC_BOOK3S_64 */ - phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); phys_addr_t kernstart_addr; @@ -314,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr, { } -/* - * We do not have access to the sparsemem vmemmap, so we fallback to - * walking the list of sparsemem blocks which we already maintain for - * the sake of crashdump. In the long run, we might want to maintain - * a tree if performance of that linear walk becomes a problem. - * - * realmode_pfn_to_page functions can fail due to: - * 1) As real sparsemem blocks do not lay in RAM continously (they - * are in virtual address space which is not available in the real mode), - * the requested page struct can be split between blocks so get_page/put_page - * may fail. - * 2) When huge pages are used, the get_page/put_page API will fail - * in real mode as the linked addresses in the page struct are virtual - * too. - */ -struct page *realmode_pfn_to_page(unsigned long pfn) -{ - struct vmemmap_backing *vmem_back; - struct page *page; - unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; - unsigned long pg_va = (unsigned long) pfn_to_page(pfn); - - for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { - if (pg_va < vmem_back->virt_addr) - continue; - - /* After vmemmap_list entry free is possible, need check all */ - if ((pg_va + sizeof(struct page)) <= - (vmem_back->virt_addr + page_size)) { - page = (struct page *) (vmem_back->phys + pg_va - - vmem_back->virt_addr); - return page; - } - } - - /* Probably that page struct is split between real pages */ - return NULL; -} -EXPORT_SYMBOL_GPL(realmode_pfn_to_page); - -#else - -struct page *realmode_pfn_to_page(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - return page; -} -EXPORT_SYMBOL_GPL(realmode_pfn_to_page); - #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 @@ -372,7 +317,7 @@ static int __init parse_disable_radix(char *p) { bool val; - if (strlen(p) == 0) + if (!p) val = true; else if (kstrtobool(p, &val)) return -EINVAL; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index fe8c61149fb8..0a64fffabee1 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -27,12 +27,11 @@ #include <linux/mm.h> #include <linux/stddef.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/highmem.h> #include <linux/initrd.h> #include <linux/pagemap.h> #include <linux/suspend.h> -#include <linux/memblock.h> #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -63,6 +62,7 @@ #endif unsigned long long memory_limit; +bool init_mem_is_free; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; @@ -82,17 +82,7 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr) int page_is_ram(unsigned long pfn) { -#ifndef CONFIG_PPC64 /* XXX for now */ - return pfn < max_pfn; -#else - unsigned long paddr = (pfn << PAGE_SHIFT); - struct memblock_region *reg; - - for_each_memblock(memory, reg) - if (paddr >= reg->base && paddr < (reg->base + reg->size)) - return 1; - return 0; -#endif + return memblock_is_memory(__pfn_to_phys(pfn)); } pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -117,7 +107,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int __weak create_section_mapping(unsigned long start, unsigned long end) +int __weak create_section_mapping(unsigned long start, unsigned long end, int nid) { return -ENODEV; } @@ -127,7 +117,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, +int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; @@ -137,18 +127,19 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, resize_hpt_for_hotplug(memblock_phys_mem_size()); start = (unsigned long)__va(start); - rc = create_section_mapping(start, start + size); + rc = create_section_mapping(start, start + size, nid); if (rc) { pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n", start, start + size, rc); return -EFAULT; } + flush_inval_dcache_range(start, start + size); return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -169,6 +160,7 @@ int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); + flush_inval_dcache_range(start, start + size); ret = remove_section_mapping(start, start + size); /* Ensure all vmalloc mappings are flushed in case they also @@ -212,7 +204,7 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, EXPORT_SYMBOL_GPL(walk_system_ram_range); #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(void) +void __init mem_topology_setup(void) { max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; min_low_pfn = MEMORY_START >> PAGE_SHIFT; @@ -223,8 +215,11 @@ void __init initmem_init(void) /* Place all memblock_regions in the same node and merge contiguous * memblock_regions */ - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); +} +void __init initmem_init(void) +{ /* XXX need to clip this if using highmem? */ sparse_memory_present_with_active_regions(0); sparse_init(); @@ -313,11 +308,11 @@ void __init paging_init(void) unsigned long end = __fix_to_virt(FIX_HOLE); for (; v < end; v += PAGE_SIZE) - map_kernel_page(v, 0, 0); /* XXX gross */ + map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */ #endif #ifdef CONFIG_HIGHMEM - map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); @@ -353,7 +348,7 @@ void __init mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); set_max_mapnr(max_pfn); - free_all_bootmem(); + memblock_free_all(); #ifdef CONFIG_HIGHMEM { @@ -401,6 +396,7 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); + init_mem_is_free = true; free_initmem_default(POISON_FREE_INITMEM); } @@ -512,10 +508,13 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access, trap; + unsigned long trap; + bool is_exec; - if (radix_enabled()) + if (radix_enabled()) { + prefetch((void *)address); return; + } /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) @@ -532,16 +531,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; switch (trap) { case 0x300: - access = 0UL; + is_exec = false; break; case 0x400: - access = _PAGE_EXEC; + is_exec = true; break; default: return; } - hash_preload(vma->vm_mm, address, access, trap); + hash_preload(vma->vm_mm, address, is_exec, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ && defined(CONFIG_HUGETLB_PAGE) diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index d503f344e476..b24ce40acd47 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -39,12 +39,12 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -76,9 +76,10 @@ static inline unsigned long stack_maxrandom_size(void) return (1<<30); } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. */ @@ -196,26 +197,28 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, } static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor) + unsigned long random_factor, + struct rlimit *rlim_stack) { - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = radix__arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; } } #else /* dummy */ extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor); + unsigned long random_factor, + struct rlimit *rlim_stack); #endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -223,16 +226,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm) random_factor = arch_mmap_rnd(); if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor); + return radix__arch_pick_mmap_layout(mm, random_factor, + rlim_stack); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 0ab297c4cfad..f84e14f23e50 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -57,8 +57,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * in switch_slb(), and/or the store of paca->mm_ctx_id in * copy_mm_to_paca(). * - * On the read side the barrier is in pte_xchg(), which orders - * the store to the PTE vs the load of mm_cpumask. + * On the other side, the barrier is in mm/tlb-radix.c for + * radix which orders earlier stores to clear the PTEs vs + * the load of mm_cpumask. And pte_xchg which does the same + * thing for hash. * * This full barrier is needed by membarrier when switching * between processes after store to rq->curr, before user-space diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 3f980baade4c..510f103d7813 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -26,48 +26,16 @@ #include <asm/mmu_context.h> #include <asm/pgalloc.h> -static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); static int alloc_context_id(int min_id, int max_id) { - int index, err; - -again: - if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&mmu_context_lock); - err = ida_get_new_above(&mmu_context_ida, min_id, &index); - spin_unlock(&mmu_context_lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > max_id) { - spin_lock(&mmu_context_lock); - ida_remove(&mmu_context_ida, index); - spin_unlock(&mmu_context_lock); - return -ENOMEM; - } - - return index; + return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); } void hash__reserve_context_id(int id) { - int rc, result = 0; - - do { - if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) - break; - - spin_lock(&mmu_context_lock); - rc = ida_get_new_above(&mmu_context_ida, id, &result); - spin_unlock(&mmu_context_lock); - } while (rc == -EAGAIN); + int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); } @@ -85,6 +53,8 @@ int hash__alloc_context_id(void) } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +void slb_setup_new_exec(void); + static int hash__init_new_context(struct mm_struct *mm) { int index; @@ -94,13 +64,6 @@ static int hash__init_new_context(struct mm_struct *mm) return index; /* - * In the case of exec, use the default limit, - * otherwise inherit it from the mm we are duplicating. - */ - if (!mm->context.slb_addr_limit) - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; - - /* * The old code would re-promote on fork, we don't do that when using * slices as it could cause problem promoting slices that have been * forced down to 4K. @@ -115,7 +78,7 @@ static int hash__init_new_context(struct mm_struct *mm) * check against 0 is OK. */ if (mm->context.id == 0) - slice_set_user_psize(mm, mmu_virtual_psize); + slice_init_new_context_exec(mm); subpage_prot_init_new_context(mm); @@ -123,6 +86,13 @@ static int hash__init_new_context(struct mm_struct *mm) return index; } +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); + + slb_setup_new_exec(); +} + static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; @@ -166,9 +136,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) mm->context.id = index; -#ifdef CONFIG_PPC_64K_PAGES mm->context.pte_frag = NULL; -#endif + mm->context.pmd_frag = NULL; #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(mm); #endif @@ -180,39 +149,64 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) void __destroy_context(int context_id) { - spin_lock(&mmu_context_lock); - ida_remove(&mmu_context_ida, context_id); - spin_unlock(&mmu_context_lock); + ida_free(&mmu_context_ida, context_id); } EXPORT_SYMBOL_GPL(__destroy_context); -#ifdef CONFIG_PPC_64K_PAGES -static void destroy_pagetable_page(struct mm_struct *mm) +static void destroy_contexts(mm_context_t *ctx) +{ + int index, context_id; + + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } +} + +static void pte_frag_destroy(void *pte_frag) { int count; - void *pte_frag; struct page *page; - pte_frag = mm->context.pte_frag; - if (!pte_frag) - return; - page = virt_to_page(pte_frag); /* drop all the pending references */ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { + if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { pgtable_page_dtor(page); - free_unref_page(page); + __free_page(page); } } -#else -static inline void destroy_pagetable_page(struct mm_struct *mm) +static void pmd_frag_destroy(void *pmd_frag) { + int count; + struct page *page; + + page = virt_to_page(pmd_frag); + /* drop all the pending references */ + count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static void destroy_pagetable_cache(struct mm_struct *mm) +{ + void *frag; + + frag = mm->context.pte_frag; + if (frag) + pte_frag_destroy(frag); + + frag = mm->context.pmd_frag; + if (frag) + pmd_frag_destroy(frag); return; } -#endif void destroy_context(struct mm_struct *mm) { @@ -223,13 +217,14 @@ void destroy_context(struct mm_struct *mm) WARN_ON(process_tb[mm->context.id].prtb0 != 0); else subpage_prot_free(mm); - destroy_pagetable_page(mm); - __destroy_context(mm->context.id); + destroy_contexts(&mm->context); mm->context.id = MMU_NO_CONTEXT; } void arch_exit_mmap(struct mm_struct *mm) { + destroy_pagetable_cache(mm); + if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table @@ -252,15 +247,7 @@ void arch_exit_mmap(struct mm_struct *mm) #ifdef CONFIG_PPC_RADIX_MMU void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - isync(); - mtspr(SPRN_PID, next->context.id); - isync(); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); - } else { - mtspr(SPRN_PID, next->context.id); - isync(); - } + mtspr(SPRN_PID, next->context.id); + isync(); } #endif diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c index aa5a7fd89461..921c1e33e941 100644 --- a/arch/powerpc/mm/mmu_context_hash32.c +++ b/arch/powerpc/mm/mmu_context_hash32.c @@ -27,7 +27,6 @@ #include <linux/export.h> #include <asm/mmu_context.h> -#include <asm/tlbflush.h> /* * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e0a2d8e806ed..56c2234cc6ae 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -18,15 +18,21 @@ #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/swap.h> +#include <linux/sizes.h> #include <asm/mmu_context.h> +#include <asm/pte-walk.h> static DEFINE_MUTEX(mem_list_mutex); +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) + struct mm_iommu_table_group_mem_t { struct list_head next; struct rcu_head rcu; unsigned long used; atomic64_t mapped; + unsigned int pageshift; u64 ua; /* userspace address */ u64 entries; /* number of entries in hpas[] */ u64 *hpas; /* vmalloc'ed */ @@ -75,8 +81,7 @@ EXPORT_SYMBOL_GPL(mm_iommu_preregistered); /* * Taken from alloc_migrate_target with changes to remove CMA allocations */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, - int **resultp) +struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) { gfp_t gfp_mask = GFP_USER; struct page *new_page; @@ -112,7 +117,7 @@ static int mm_iommu_move_page_from_cma(struct page *page) put_page(page); /* Drop the gup reference */ ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, - NULL, 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE); if (ret) { if (!list_empty(&cma_migrate_pages)) putback_movable_pages(&cma_migrate_pages); @@ -126,6 +131,9 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, { struct mm_iommu_table_group_mem_t *mem; long i, j, ret = 0, locked_entries = 0; + unsigned int pageshift; + unsigned long flags; + unsigned long cur_ua; struct page *page = NULL; mutex_lock(&mem_list_mutex); @@ -160,7 +168,13 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, goto unlock_exit; } - mem->hpas = vzalloc(entries * sizeof(mem->hpas[0])); + /* + * For a starting point for a maximum page size calculation + * we use @ua and @entries natural alignment to allow IOMMU pages + * smaller than huge pages but still bigger than PAGE_SIZE. + */ + mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); + mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); if (!mem->hpas) { kfree(mem); ret = -ENOMEM; @@ -168,7 +182,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, } for (i = 0; i < entries; ++i) { - if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + cur_ua = ua + (i << PAGE_SHIFT); + if (1 != get_user_pages_fast(cur_ua, 1/* pages */, 1/* iswrite */, &page)) { ret = -EFAULT; for (j = 0; j < i; ++j) @@ -187,7 +202,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, if (is_migrate_cma_page(page)) { if (mm_iommu_move_page_from_cma(page)) goto populate; - if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + if (1 != get_user_pages_fast(cur_ua, 1/* pages */, 1/* iswrite */, &page)) { ret = -EFAULT; @@ -200,6 +215,24 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, } } populate: + pageshift = PAGE_SHIFT; + if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) { + pte_t *pte; + struct page *head = compound_head(page); + unsigned int compshift = compound_order(head); + unsigned int pteshift; + + local_irq_save(flags); /* disables as well */ + pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift); + + /* Double check it is still the same pinned page */ + if (pte && pte_page(*pte) == head && + pteshift == compshift + PAGE_SHIFT) + pageshift = max_t(unsigned int, pteshift, + PAGE_SHIFT); + local_irq_restore(flags); + } + mem->pageshift = min(mem->pageshift, pageshift); mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; } @@ -234,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) if (!page) continue; + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + put_page(page); mem->hpas[i] = 0; } @@ -331,7 +367,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries) @@ -350,7 +385,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, EXPORT_SYMBOL_GPL(mm_iommu_find); long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned long *hpa) + unsigned long ua, unsigned int pageshift, unsigned long *hpa) { const long entry = (ua - mem->ua) >> PAGE_SHIFT; u64 *va = &mem->hpas[entry]; @@ -358,14 +393,17 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, if (entry >= mem->entries) return -EFAULT; - *hpa = *va | (ua & ~PAGE_MASK); + if (pageshift > mem->pageshift) + return -EFAULT; + + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned long *hpa) + unsigned long ua, unsigned int pageshift, unsigned long *hpa) { const long entry = (ua - mem->ua) >> PAGE_SHIFT; void *va = &mem->hpas[entry]; @@ -374,15 +412,38 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, if (entry >= mem->entries) return -EFAULT; + if (pageshift > mem->pageshift) + return -EFAULT; + pa = (void *) vmalloc_to_phys(va); if (!pa) return -EFAULT; - *hpa = *pa | (ua & ~PAGE_MASK); + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; } -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); + +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) +{ + struct mm_iommu_table_group_mem_t *mem; + long entry; + void *va; + unsigned long *pa; + + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); + if (!mem) + return; + + entry = (ua - mem->ua) >> PAGE_SHIFT; + va = &mem->hpas[entry]; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return; + + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; +} long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) { diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 4554d6527682..2faca46ad720 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -44,7 +44,7 @@ #include <linux/mm.h> #include <linux/init.h> #include <linux/spinlock.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/slab.h> @@ -54,16 +54,44 @@ #include "mmu_decl.h" -static unsigned int first_context, last_context; +/* + * The MPC8xx has only 16 contexts. We rotate through them on each task switch. + * A better way would be to keep track of tasks that own contexts, and implement + * an LRU usage. That way very active tasks don't always have to pay the TLB + * reload overhead. The kernel pages are mapped shared, so the kernel can run on + * behalf of any task that makes a kernel entry. Shared does not mean they are + * not protected, just that the ASID comparison is not performed. -- Dan + * + * The IBM4xx has 256 contexts, so we can just rotate through these as a way of + * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison + * is disabled, so we can use a TID of zero to represent all kernel pages as + * shared among all contexts. -- Dan + * + * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should + * normally never have to steal though the facility is present if needed. + * -- BenH + */ +#define FIRST_CONTEXT 1 +#ifdef DEBUG_CLAMP_LAST_CONTEXT +#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT +#elif defined(CONFIG_PPC_8xx) +#define LAST_CONTEXT 16 +#elif defined(CONFIG_PPC_47x) +#define LAST_CONTEXT 65535 +#else +#define LAST_CONTEXT 255 +#endif + static unsigned int next_context, nr_free_contexts; static unsigned long *context_map; +#ifdef CONFIG_SMP static unsigned long *stale_map[NR_CPUS]; +#endif static struct mm_struct **context_mm; static DEFINE_RAW_SPINLOCK(context_lock); -static bool no_selective_tlbil; #define CTX_MAP_SIZE \ - (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) + (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1)) /* Steal a context from a task that has one at the moment. @@ -87,7 +115,7 @@ static unsigned int steal_context_smp(unsigned int id) struct mm_struct *mm; unsigned int cpu, max, i; - max = last_context - first_context; + max = LAST_CONTEXT - FIRST_CONTEXT; /* Attempt to free next_context first and then loop until we manage */ while (max--) { @@ -99,8 +127,8 @@ static unsigned int steal_context_smp(unsigned int id) */ if (mm->context.active) { id++; - if (id > last_context) - id = first_context; + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; continue; } pr_hardcont(" | steal %d from 0x%p", id, mm); @@ -139,10 +167,12 @@ static unsigned int steal_context_smp(unsigned int id) static unsigned int steal_all_contexts(void) { struct mm_struct *mm; +#ifdef CONFIG_SMP int cpu = smp_processor_id(); +#endif unsigned int id; - for (id = first_context; id <= last_context; id++) { + for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) { /* Pick up the victim mm */ mm = context_mm[id]; @@ -150,22 +180,24 @@ static unsigned int steal_all_contexts(void) /* Mark this mm as having no context anymore */ mm->context.id = MMU_NO_CONTEXT; - if (id != first_context) { + if (id != FIRST_CONTEXT) { context_mm[id] = NULL; __clear_bit(id, context_map); #ifdef DEBUG_MAP_CONSISTENCY mm->context.active = 0; #endif } +#ifdef CONFIG_SMP __clear_bit(id, stale_map[cpu]); +#endif } /* Flush the TLB for all contexts (not to be used on SMP) */ _tlbil_all(); - nr_free_contexts = last_context - first_context; + nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT; - return first_context; + return FIRST_CONTEXT; } /* Note that this will also be called on SMP if all other CPUs are @@ -176,7 +208,9 @@ static unsigned int steal_all_contexts(void) static unsigned int steal_context_up(unsigned int id) { struct mm_struct *mm; +#ifdef CONFIG_SMP int cpu = smp_processor_id(); +#endif /* Pick up the victim mm */ mm = context_mm[id]; @@ -190,7 +224,9 @@ static unsigned int steal_context_up(unsigned int id) mm->context.id = MMU_NO_CONTEXT; /* XXX This clear should ultimately be part of local_flush_tlb_mm */ +#ifdef CONFIG_SMP __clear_bit(id, stale_map[cpu]); +#endif return id; } @@ -201,7 +237,7 @@ static void context_check_map(void) unsigned int id, nrf, nact; nrf = nact = 0; - for (id = first_context; id <= last_context; id++) { + for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) { int used = test_bit(id, context_map); if (!used) nrf++; @@ -219,7 +255,7 @@ static void context_check_map(void) if (nact > num_online_cpus()) pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", nact, num_online_cpus()); - if (first_context > 0 && !test_bit(0, context_map)) + if (FIRST_CONTEXT > 0 && !test_bit(0, context_map)) pr_err("MMU: Context 0 has been freed !!!\n"); } #else @@ -229,7 +265,10 @@ static void context_check_map(void) { } void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned int i, id, cpu = smp_processor_id(); + unsigned int id; +#ifdef CONFIG_SMP + unsigned int i, cpu = smp_processor_id(); +#endif unsigned long *map; /* No lockless fast path .. yet */ @@ -263,8 +302,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, /* We really don't have a context, let's try to acquire one */ id = next_context; - if (id > last_context) - id = first_context; + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; map = context_map; /* No more free contexts, let's try to steal one */ @@ -277,7 +316,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, goto stolen; } #endif /* CONFIG_SMP */ - if (no_selective_tlbil) + if (IS_ENABLED(CONFIG_PPC_8xx)) id = steal_all_contexts(); else id = steal_context_up(id); @@ -287,9 +326,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, /* We know there's at least one free context, try to find it */ while (__test_and_set_bit(id, map)) { - id = find_next_zero_bit(map, last_context+1, id); - if (id > last_context) - id = first_context; + id = find_next_zero_bit(map, LAST_CONTEXT+1, id); + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; } stolen: next_context = id + 1; @@ -303,6 +342,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, /* If that context got marked stale on this CPU, then flush the * local TLB for it and unmark it before we use it */ +#ifdef CONFIG_SMP if (test_bit(id, stale_map[cpu])) { pr_hardcont(" | stale flush %d [%d..%d]", id, cpu_first_thread_sibling(cpu), @@ -317,6 +357,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, __clear_bit(id, stale_map[i]); } } +#endif /* Flick the MMU and release lock */ pr_hardcont(" -> %d\n", id); @@ -331,6 +372,17 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) { pr_hard("initing context for mm @%p\n", mm); +#ifdef CONFIG_PPC_MM_SLICES + /* + * We have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we properly + * initialize context slice details for newly allocated mm's (which will + * have id == 0) and don't alter context slice inherited via fork (which + * will have id != 0). + */ + if (mm->context.id == 0) + slice_init_new_context_exec(mm); +#endif mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; return 0; @@ -407,52 +459,13 @@ void __init mmu_context_init(void) init_mm.context.active = NR_CPUS; /* - * The MPC8xx has only 16 contexts. We rotate through them on each - * task switch. A better way would be to keep track of tasks that - * own contexts, and implement an LRU usage. That way very active - * tasks don't always have to pay the TLB reload overhead. The - * kernel pages are mapped shared, so the kernel can run on behalf - * of any task that makes a kernel entry. Shared does not mean they - * are not protected, just that the ASID comparison is not performed. - * -- Dan - * - * The IBM4xx has 256 contexts, so we can just rotate through these - * as a way of "switching" contexts. If the TID of the TLB is zero, - * the PID/TID comparison is disabled, so we can use a TID of zero - * to represent all kernel pages as shared among all contexts. - * -- Dan - * - * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We - * should normally never have to steal though the facility is - * present if needed. - * -- BenH - */ - if (mmu_has_feature(MMU_FTR_TYPE_8xx)) { - first_context = 0; - last_context = 15; - no_selective_tlbil = true; - } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) { - first_context = 1; - last_context = 65535; - no_selective_tlbil = false; - } else { - first_context = 1; - last_context = 255; - no_selective_tlbil = false; - } - -#ifdef DEBUG_CLAMP_LAST_CONTEXT - last_context = DEBUG_CLAMP_LAST_CONTEXT; -#endif - /* * Allocate the maps used by context management */ - context_map = memblock_virt_alloc(CTX_MAP_SIZE, 0); - context_mm = memblock_virt_alloc(sizeof(void *) * (last_context + 1), 0); -#ifndef CONFIG_SMP - stale_map[0] = memblock_virt_alloc(CTX_MAP_SIZE, 0); -#else - stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0); + context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); + context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), + SMP_CACHE_BYTES); +#ifdef CONFIG_SMP + stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", @@ -461,17 +474,17 @@ void __init mmu_context_init(void) printk(KERN_INFO "MMU: Allocated %zu bytes of context maps for %d contexts\n", - 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)), - last_context - first_context + 1); + 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)), + LAST_CONTEXT - FIRST_CONTEXT + 1); /* * Some processors have too few contexts to reserve one for * init_mm, and require using context 0 for a normal task. * Other processors reserve the use of context zero for the kernel. - * This code assumes first_context < 32. + * This code assumes FIRST_CONTEXT < 32. */ - context_map[0] = (1 << first_context) - 1; - next_context = first_context; - nr_free_contexts = last_context - first_context + 1; + context_map[0] = (1 << FIRST_CONTEXT) - 1; + next_context = FIRST_CONTEXT; + nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1; } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 57fbc554c785..8574fbbc45e0 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -19,10 +19,10 @@ * */ #include <linux/mm.h> -#include <asm/tlbflush.h> #include <asm/mmu.h> #ifdef CONFIG_PPC_MMU_NOHASH +#include <asm/trace.h> /* * On 40x and 8xx, we directly inline tlbia and tlbivax @@ -31,10 +31,12 @@ static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(MMU_NO_CONTEXT); } static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(pid); } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) @@ -56,6 +58,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); + trace_tlbie(0, 0, address, pid, 0, 0, 0); } #elif defined(CONFIG_PPC_BOOK3E) extern void _tlbil_va(unsigned long address, unsigned int pid, @@ -83,7 +86,7 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, #else /* CONFIG_PPC_MMU_NOHASH */ extern void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap); + bool is_exec, unsigned long trap); extern void _tlbie(unsigned long address); @@ -98,7 +101,6 @@ extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); extern int __map_without_bats; -extern int __allow_ioremap_reserved; extern unsigned int rtas_data, rtas_size; struct hash_pte; diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index edd8d0bc9364..ce28ae5ca080 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -11,7 +11,7 @@ #define pr_fmt(fmt) "numa: " fmt #include <linux/threads.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/mmzone.h> @@ -19,7 +19,6 @@ #include <linux/nodemask.h> #include <linux/cpu.h> #include <linux/notifier.h> -#include <linux/memblock.h> #include <linux/of.h> #include <linux/pfn.h> #include <linux/cpuset.h> @@ -788,7 +787,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) void *nd; int tnid; - nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); nd = __va(nd_pa); /* report and initialize */ @@ -831,18 +830,13 @@ out: of_node_put(rtas); } -void __init initmem_init(void) +void __init mem_topology_setup(void) { - int nid, cpu; - - max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; + int cpu; if (parse_numa_properties()) setup_nonnuma(); - memblock_dump_all(); - /* * Modify the set of possible NUMA nodes to reflect information * available about the set of online nodes, and the set of nodes @@ -853,6 +847,23 @@ void __init initmem_init(void) find_possible_nodes(); + setup_node_to_cpumask_map(); + + reset_numa_cpu_lookup_table(); + + for_each_present_cpu(cpu) + numa_setup_cpu(cpu); +} + +void __init initmem_init(void) +{ + int nid; + + max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + max_pfn = max_low_pfn; + + memblock_dump_all(); + for_each_online_node(nid) { unsigned long start_pfn, end_pfn; @@ -863,10 +874,6 @@ void __init initmem_init(void) sparse_init(); - setup_node_to_cpumask_map(); - - reset_numa_cpu_lookup_table(); - /* * We need the numa_cpu_lookup_table to be accurate for all CPUs, * even before we online them, so that we can use cpu_to_{node,mem} @@ -876,8 +883,6 @@ void __init initmem_init(void) */ cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare", ppc_numa_cpu_prepare, ppc_numa_cpu_dead); - for_each_present_cpu(cpu) - numa_setup_cpu(cpu); } static int __init early_numa(char *p) @@ -1072,7 +1077,6 @@ static int prrn_enabled; static void reset_topology_timer(void); static int topology_timer_secs = 1; static int topology_inited; -static int topology_update_needed; /* * Change polling interval for associativity changes. @@ -1105,7 +1109,7 @@ static void setup_cpu_associativity_change_counters(void) for_each_possible_cpu(cpu) { int i; u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts; for (i = 0; i < distance_ref_points_depth; i++) counts[i] = hypervisor_counts[i]; @@ -1131,7 +1135,7 @@ static int update_cpu_associativity_changes_mask(void) for_each_possible_cpu(cpu) { int i, changed = 0; u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts; for (i = 0; i < distance_ref_points_depth; i++) { if (hypervisor_counts[i] != counts[i]) { @@ -1174,7 +1178,7 @@ static long vphn_get_associativity(unsigned long cpu, switch (rc) { case H_FUNCTION: - printk(KERN_INFO + printk_once(KERN_INFO "VPHN is not supported. Disabling polling...\n"); stop_topology_update(); break; @@ -1199,7 +1203,9 @@ int find_and_online_cpu_nid(int cpu) int new_nid; /* Use associativity from first thread for all siblings */ - vphn_get_associativity(cpu, associativity); + if (vphn_get_associativity(cpu, associativity)) + return cpu_to_node(cpu); + new_nid = associativity_to_nid(associativity); if (new_nid < 0 || !node_possible(new_nid)) new_nid = first_online_node; @@ -1210,9 +1216,10 @@ int find_and_online_cpu_nid(int cpu) * Need to ensure that NODE_DATA is initialized for a node from * available memory (see memblock_alloc_try_nid). If unable to * init the node, then default to nearest node that has memory - * installed. + * installed. Skip onlining a node if the subsystems are not + * yet initialized. */ - if (try_online_node(new_nid)) + if (!topology_inited || try_online_node(new_nid)) new_nid = first_online_node; #else /* @@ -1300,17 +1307,14 @@ int numa_update_cpu_topology(bool cpus_locked) struct device *dev; int weight, new_nid, i = 0; - if (!prrn_enabled && !vphn_enabled) { - if (!topology_inited) - topology_update_needed = 1; + if (!prrn_enabled && !vphn_enabled && topology_inited) return 0; - } weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) return 0; - updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL); + updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL); if (!updates) return 0; @@ -1417,7 +1421,6 @@ int numa_update_cpu_topology(bool cpus_locked) out: kfree(updates); - topology_update_needed = 0; return changed; } @@ -1451,7 +1454,8 @@ static struct timer_list topology_timer; static void reset_topology_timer(void) { - mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ); + if (vphn_enabled) + mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ); } #ifdef CONFIG_SMP @@ -1516,6 +1520,10 @@ int start_topology_update(void) } } + pr_info("Starting topology update%s%s\n", + (prrn_enabled ? " prrn_enabled" : ""), + (vphn_enabled ? " vphn_enabled" : "")); + return rc; } @@ -1537,6 +1545,8 @@ int stop_topology_update(void) rc = del_timer_sync(&topology_timer); } + pr_info("Stopping topology update\n"); + return rc; } @@ -1545,6 +1555,15 @@ int prrn_is_enabled(void) return prrn_enabled; } +void __init shared_proc_topology_init(void) +{ + if (lppaca_shared_proc(get_lppaca())) { + bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), + nr_cpumask_bits); + numa_update_cpu_topology(false); + } +} + static int topology_read(struct seq_file *file, void *v) { if (vphn_enabled || prrn_enabled) @@ -1602,10 +1621,6 @@ static int topology_update_init(void) return -ENOMEM; topology_inited = 1; - if (topology_update_needed) - bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), - nr_cpumask_bits); - return 0; } device_initcall(topology_update_init); diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index a2298930f990..e0ccf36714b2 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -42,7 +42,7 @@ int __meminit vmemmap_create_mapping(unsigned long start, * thus must have the low bits clear */ for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); + BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); return 0; } @@ -70,7 +70,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -89,8 +89,6 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); } else { pgdp = pgd_offset_k(ea); #ifndef __PAGETABLE_PUD_FOLDED @@ -113,9 +111,8 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); } + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); smp_wmb(); return 0; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 422e80253a33..9f93c9f985c5 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -9,14 +9,22 @@ #include <linux/sched.h> #include <linux/mm_types.h> +#include <linux/memblock.h> #include <misc/cxl-base.h> #include <asm/pgalloc.h> #include <asm/tlb.h> +#include <asm/trace.h> +#include <asm/powernv.h> #include "mmu_decl.h" #include <trace/events/thp.h> +unsigned long __pmd_frag_nr; +EXPORT_SYMBOL(__pmd_frag_nr); +unsigned long __pmd_frag_size_shift; +EXPORT_SYMBOL(__pmd_frag_size_shift); + int (*register_process_table)(unsigned long base, unsigned long page_size, unsigned long tbl_size); @@ -34,13 +42,16 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, int changed; #ifdef CONFIG_DEBUG_VM WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); + assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); #endif changed = !pmd_same(*(pmdp), entry); if (changed) { - __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp), - pmd_pte(entry), address); - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * We can use MMU_PAGE_2M here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pmdp_ptep(pmdp), + pmd_pte(entry), address, MMU_PAGE_2M); } return changed; } @@ -58,9 +69,14 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd))); + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(pmd_lockptr(mm, pmdp)); + WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); @@ -95,7 +111,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, { unsigned long old_pmd; - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling @@ -141,7 +157,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { - return; + if (radix_enabled()) + prefetch((void *)addr); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -155,15 +172,15 @@ void mmu_cleanup_all(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int create_section_mapping(unsigned long start, unsigned long end) +int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) { if (radix_enabled()) - return radix__create_section_mapping(start, end); + return radix__create_section_mapping(start, end, nid); - return hash__create_section_mapping(start, end); + return hash__create_section_mapping(start, end, nid); } -int remove_section_mapping(unsigned long start, unsigned long end) +int __meminit remove_section_mapping(unsigned long start, unsigned long end) { if (radix_enabled()) return radix__remove_section_mapping(start, end); @@ -171,3 +188,297 @@ int remove_section_mapping(unsigned long start, unsigned long end) return hash__remove_section_mapping(start, end); } #endif /* CONFIG_MEMORY_HOTPLUG */ + +void __init mmu_partition_table_init(void) +{ + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + unsigned long ptcr; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + + /* + * update partition table control register, + * 64 K size. + */ + ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); + mtspr(SPRN_PTCR, ptcr); + powernv_set_nmmu_ptcr(ptcr); +} + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1) +{ + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. + */ + asm volatile("ptesync" : : : "memory"); + if (old & PATB_HR) { + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); + } else { + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} +EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); + +static pmd_t *get_pmd_from_cache(struct mm_struct *mm) +{ + void *pmd_frag, *ret; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pmd_frag; + if (ret) { + pmd_frag = ret + PMD_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) + pmd_frag = NULL; + mm->context.pmd_frag = pmd_frag; + } + spin_unlock(&mm->page_table_lock); + return (pmd_t *)ret; +} + +static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) +{ + void *ret = NULL; + struct page *page; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + page = alloc_page(gfp); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_pages(page, 0); + return NULL; + } + + atomic_set(&page->pt_frag_refcount, 1); + + ret = page_address(page); + /* + * if we support only one fragment just return the + * allocated page. + */ + if (PMD_FRAG_NR == 1) + return ret; + + spin_lock(&mm->page_table_lock); + /* + * If we find pgtable_page set, we return + * the allocated page with single fragement + * count. + */ + if (likely(!mm->context.pmd_frag)) { + atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + mm->context.pmd_frag = ret + PMD_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pmd_t *)ret; +} + +pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmd; + + pmd = get_pmd_from_cache(mm); + if (pmd) + return pmd; + + return __alloc_for_pmdcache(mm); +} + +void pmd_fragment_free(unsigned long *pmd) +{ + struct page *page = virt_to_page(pmd); + + BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static pte_t *get_pte_from_cache(struct mm_struct *mm) +{ + void *pte_frag, *ret; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pte_frag; + if (ret) { + pte_frag = ret + PTE_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) + pte_frag = NULL; + mm->context.pte_frag = pte_frag; + } + spin_unlock(&mm->page_table_lock); + return (pte_t *)ret; +} + +static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) +{ + void *ret = NULL; + struct page *page; + + if (!kernel) { + page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + } else { + page = alloc_page(PGALLOC_GFP); + if (!page) + return NULL; + } + + atomic_set(&page->pt_frag_refcount, 1); + + ret = page_address(page); + /* + * if we support only one fragment just return the + * allocated page. + */ + if (PTE_FRAG_NR == 1) + return ret; + spin_lock(&mm->page_table_lock); + /* + * If we find pgtable_page set, we return + * the allocated page with single fragement + * count. + */ + if (likely(!mm->context.pte_frag)) { + atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR); + mm->context.pte_frag = ret + PTE_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pte_t *)ret; +} + +pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +{ + pte_t *pte; + + pte = get_pte_from_cache(mm); + if (pte) + return pte; + + return __alloc_for_ptecache(mm, kernel); +} + +void pte_fragment_free(unsigned long *table, int kernel) +{ + struct page *page = virt_to_page(table); + + BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&page->pt_frag_refcount)) { + if (!kernel) + pgtable_page_dtor(page); + __free_page(page); + } +} + +static inline void pgtable_free(void *table, int index) +{ + switch (index) { + case PTE_INDEX: + pte_fragment_free(table, 0); + break; + case PMD_INDEX: + pmd_fragment_free(table); + break; + case PUD_INDEX: + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); + break; +#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) + /* 16M hugepd directory at pud level */ + case HTLB_16M_INDEX: + BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); + break; + /* 16G hugepd directory at the pgd level */ + case HTLB_16G_INDEX: + BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); + break; +#endif + /* We don't free pgd table via RCU callback */ + default: + BUG(); + } +} + +#ifdef CONFIG_SMP +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); + pgf |= index; + tlb_remove_table(tlb, (void *)pgf); +} + +void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + return pgtable_free(table, index); +} +#else +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + return pgtable_free(table, index); +} +#endif + +#ifdef CONFIG_PROC_FS +atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; + +void arch_report_meminfo(struct seq_file *m) +{ + /* + * Hash maps the memory with one size mmu_linear_psize. + * So don't bother to print these on hash + */ + if (!radix_enabled()) + return; + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); + seq_printf(m, "DirectMap64k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); + seq_printf(m, "DirectMap2M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); + seq_printf(m, "DirectMap1G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); +} +#endif /* CONFIG_PROC_FS */ diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 469808e77e58..c08d49046a96 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -24,6 +24,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/thp.h> +#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) +#warning Limited user VSID range means pagetable space is wasted +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * vmemmap is the starting address of the virtual address space where @@ -138,7 +142,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -157,8 +161,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); } else { /* * If the mm subsystem is not fully up, we cannot create a @@ -166,7 +169,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag * entry in the hardware page table. * */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), mmu_io_psize, mmu_kernel_ssize)) { printk(KERN_ERR "Failed to do bolted mapping IO " "memory at %016lx !\n", pa); @@ -189,7 +192,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr #ifdef CONFIG_DEBUG_VM WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); #endif __asm__ __volatile__( @@ -261,7 +264,8 @@ void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); + + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* * we store the pgtable in the second half of PMD */ @@ -281,7 +285,8 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pgtable_t pgtable; pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; pgtable = *pgtable_slot; /* @@ -320,7 +325,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); + vsid = get_user_vsid(&mm->context, addr, ssize); WARN_ON(vsid == 0); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 2e10a964e290..931156069a81 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -48,20 +48,88 @@ static int native_register_process_table(unsigned long base, unsigned long pg_sz return 0; } -static __ref void *early_alloc_pgtable(unsigned long size) +static __ref void *early_alloc_pgtable(unsigned long size, int nid, + unsigned long region_start, unsigned long region_end) { + unsigned long pa = 0; void *pt; - pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE)); + if (region_start || region_end) /* has region hint */ + pa = memblock_alloc_range(size, size, region_start, region_end, + MEMBLOCK_NONE); + else if (nid != -1) /* has node hint */ + pa = memblock_alloc_base_nid(size, size, + MEMBLOCK_ALLOC_ANYWHERE, + nid, MEMBLOCK_NONE); + + if (!pa) + pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE); + + BUG_ON(!pa); + + pt = __va(pa); memset(pt, 0, size); return pt; } -int radix__map_kernel_page(unsigned long ea, unsigned long pa, +static int early_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, - unsigned int map_page_size) + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) +{ + unsigned long pfn = pa >> PAGE_SHIFT; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, + region_start, region_end); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, + region_start, region_end); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + smp_wmb(); + return 0; +} + +/* + * nid, region_start, and region_end are hints to try to place the page + * table memory in the same node or region. + */ +static int __map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) { + unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; @@ -70,61 +138,48 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, * Make sure task size is correct as per the max adddr */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - } else { - pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } - pudp = pud_offset(pgdp, ea); - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); + + if (unlikely(!slab_is_available())) + return early_map_kernel_page(ea, pa, flags, map_page_size, + nid, region_start, region_end); + + /* + * Should make page table allocation functions be able to take a + * node, so we can place kernel page tables on the right nodes after + * boot. + */ + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); smp_wmb(); return 0; } +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); +} + #ifdef CONFIG_STRICT_KERNEL_RWX void radix__change_memory_range(unsigned long start, unsigned long end, unsigned long clear) @@ -171,16 +226,6 @@ void radix__mark_rodata_ro(void) { unsigned long start, end; - /* - * mark_rodata_ro() will mark itself as !writable at some point. - * Due to DD1 workaround in radix__pte_update(), we'll end up with - * an invalid pte and the system will crash quite severly. - */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n"); - return; - } - start = (unsigned long)_stext; end = (unsigned long)__init_begin; @@ -196,9 +241,8 @@ void radix__mark_initmem_nx(void) } #endif /* CONFIG_STRICT_KERNEL_RWX */ -static inline void __meminit print_mapping(unsigned long start, - unsigned long end, - unsigned long size) +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) { char buf[10]; @@ -207,76 +251,78 @@ static inline void __meminit print_mapping(unsigned long start, string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); - pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? " (exec)" : ""); +} + +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + if (addr < __pa_symbol(__init_begin)) + return __pa_symbol(__init_begin); +#endif + return end; } static int __meminit create_physical_mapping(unsigned long start, - unsigned long end) + unsigned long end, + int nid) { unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; pgprot_t prot; - unsigned long max_mapping_size; -#ifdef CONFIG_STRICT_KERNEL_RWX - int split_text_mapping = 1; -#else - int split_text_mapping = 0; -#endif + int psize; start = _ALIGN_UP(start, PAGE_SIZE); for (addr = start; addr < end; addr += mapping_size) { unsigned long gap, previous_size; int rc; - gap = end - addr; + gap = next_boundary(addr, end) - addr; previous_size = mapping_size; - max_mapping_size = PUD_SIZE; + prev_exec = exec; -retry: if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && - mmu_psize_defs[MMU_PAGE_1G].shift && - PUD_SIZE <= max_mapping_size) + mmu_psize_defs[MMU_PAGE_1G].shift) { mapping_size = PUD_SIZE; - else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && - mmu_psize_defs[MMU_PAGE_2M].shift) + psize = MMU_PAGE_1G; + } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && + mmu_psize_defs[MMU_PAGE_2M].shift) { mapping_size = PMD_SIZE; - else + psize = MMU_PAGE_2M; + } else { mapping_size = PAGE_SIZE; - - if (split_text_mapping && (mapping_size == PUD_SIZE) && - (addr <= __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(_stext)) { - max_mapping_size = PMD_SIZE; - goto retry; - } - - if (split_text_mapping && (mapping_size == PMD_SIZE) && - (addr <= __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(_stext)) - mapping_size = PAGE_SIZE; - - if (mapping_size != previous_size) { - print_mapping(start, addr, previous_size); - start = addr; + psize = mmu_virtual_psize; } vaddr = (unsigned long)__va(addr); if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || - overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { prot = PAGE_KERNEL_X; - else + exec = true; + } else { prot = PAGE_KERNEL; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } - rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size); + rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); if (rc) return rc; + + update_page_count(psize, 1); } - print_mapping(start, addr, mapping_size); + print_mapping(start, addr, mapping_size, exec); return 0; } -static void __init radix_init_pgtable(void) +void __init radix_init_pgtable(void) { unsigned long rts_field; struct memblock_region *reg; @@ -286,9 +332,16 @@ static void __init radix_init_pgtable(void) /* * Create the linear mapping, using standard page size for now */ - for_each_memblock(memory, reg) + for_each_memblock(memory, reg) { + /* + * The memblock allocator is up at this point, so the + * page tables will be allocated within the range. No + * need or a node (which we don't have yet). + */ WARN_ON(create_physical_mapping(reg->base, - reg->base + reg->size)); + reg->base + reg->size, + -1)); + } /* Find out how many PID bits are supported */ if (cpu_has_feature(CPU_FTR_HVMODE)) { @@ -317,7 +370,7 @@ static void __init radix_init_pgtable(void) * host. */ BUG_ON(PRTB_SIZE_SHIFT > 36); - process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); /* * Fill in the process table. */ @@ -470,35 +523,6 @@ found: return; } -static void update_hid_for_radix(void) -{ - unsigned long hid0; - unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */ - - asm volatile("ptesync": : :"memory"); - /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory"); - /* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory"); - asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); - trace_tlbie(0, 0, rb, 0, 2, 0, 1); - trace_tlbie(0, 0, rb, 0, 2, 1, 1); - - /* - * now switch the HID - */ - hid0 = mfspr(SPRN_HID0); - hid0 |= HID0_POWER9_RADIX; - mtspr(SPRN_HID0, hid0); - asm volatile("isync": : :"memory"); - - /* Wait for it to happen */ - while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX)) - cpu_relax(); -} - static void radix_init_amor(void) { /* @@ -513,22 +537,12 @@ static void radix_init_amor(void) static void radix_init_iamr(void) { - unsigned long iamr; - - /* - * The IAMR should set to 0 on DD1. - */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - iamr = 0; - else - iamr = (1ul << 62); - /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction * fetch. */ - mtspr(SPRN_IAMR, iamr); + mtspr(SPRN_IAMR, (1ul << 62)); } void __init radix__early_init_mmu(void) @@ -554,7 +568,6 @@ void __init radix__early_init_mmu(void) __pud_index_size = RADIX_PUD_INDEX_SIZE; __pgd_index_size = RADIX_PGD_INDEX_SIZE; __pud_cache_index = RADIX_PUD_INDEX_SIZE; - __pmd_cache_index = RADIX_PMD_INDEX_SIZE; __pte_table_size = RADIX_PTE_TABLE_SIZE; __pmd_table_size = RADIX_PMD_TABLE_SIZE; __pud_table_size = RADIX_PUD_TABLE_SIZE; @@ -575,17 +588,13 @@ void __init radix__early_init_mmu(void) #ifdef CONFIG_PCI pci_io_base = ISA_IO_BASE; #endif - - /* - * For now radix also use the same frag size - */ - __pte_frag_nr = H_PTE_FRAG_NR; - __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + __pte_frag_nr = RADIX_PTE_FRAG_NR; + __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = RADIX_PMD_FRAG_NR; + __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; if (!firmware_has_feature(FW_FEATURE_LPAR)) { radix_init_native(); - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_radix(); lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); @@ -611,10 +620,6 @@ void radix__early_init_mmu_secondary(void) * update partition table control register and UPRT */ if (!firmware_has_feature(FW_FEATURE_LPAR)) { - - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - update_hid_for_radix(); - lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); @@ -695,7 +700,7 @@ struct change_mapping_params { unsigned long aligned_end; }; -static int stop_machine_change_mapping(void *data) +static int __meminit stop_machine_change_mapping(void *data) { struct change_mapping_params *params = (struct change_mapping_params *)data; @@ -705,8 +710,8 @@ static int stop_machine_change_mapping(void *data) spin_unlock(&init_mm.page_table_lock); pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start); - create_physical_mapping(params->end, params->aligned_end); + create_physical_mapping(params->aligned_start, params->start, -1); + create_physical_mapping(params->end, params->aligned_end, -1); spin_lock(&init_mm.page_table_lock); return 0; } @@ -742,7 +747,7 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, /* * clear the pte and potentially split the mapping helper */ -static void split_kernel_mapping(unsigned long addr, unsigned long end, +static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, unsigned long size, pte_t *pte) { unsigned long mask = ~(size - 1); @@ -835,7 +840,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr, } } -static void remove_pagetable(unsigned long start, unsigned long end) +static void __meminit remove_pagetable(unsigned long start, unsigned long end) { unsigned long addr, next; pud_t *pud_base; @@ -863,12 +868,12 @@ static void remove_pagetable(unsigned long start, unsigned long end) radix__flush_tlb_kernel_range(start, end); } -int __ref radix__create_section_mapping(unsigned long start, unsigned long end) +int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { - return create_physical_mapping(start, end); + return create_physical_mapping(start, end, nid); } -int radix__remove_section_mapping(unsigned long start, unsigned long end) +int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { remove_pagetable(start, end); return 0; @@ -876,19 +881,30 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end) #endif /* CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int map_page_size, + int nid) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); +} + int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) { /* Create a PTE encoding */ unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); + int ret; + + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); + BUG_ON(ret); - BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size)); return 0; } #ifdef CONFIG_MEMORY_HOTPLUG -void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { remove_pagetable(start, start + page_size); } @@ -905,7 +921,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add #ifdef CONFIG_DEBUG_VM WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); #endif old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); @@ -1013,3 +1029,37 @@ int radix__has_transparent_hugepage(void) return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, + pte_t entry, unsigned long address, int psize) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | + _PAGE_RW | _PAGE_EXEC); + + unsigned long change = pte_val(entry) ^ pte_val(*ptep); + /* + * To avoid NMMU hang while relaxing access, we need mark + * the pte invalid in between. + */ + if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { + unsigned long old_pte, new_pte; + + old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); + /* + * new value of pte + */ + new_pte = old_pte | set; + radix__flush_tlb_page_psize(mm, address, psize); + __radix_pte_update(ptep, _PAGE_INVALID, new_pte); + } else { + __radix_pte_update(ptep, 0, set); + /* + * Book3S does not require a TLB flush when relaxing access + * restrictions when the address space is not attached to a + * NMMU, because the core MMU will reload the pte after taking + * an access fault, which is defined by the architectue. + */ + } + /* See ptesync comment in radix__set_pte_at */ +} diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9f361ae571e9..010e1c616cb2 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -44,20 +44,13 @@ static inline int is_exec_fault(void) static inline int pte_looks_normal(pte_t pte) { -#if defined(CONFIG_PPC_BOOK3S_64) - if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_present(pte) && !pte_special(pte)) { if (pte_ci(pte)) return 0; if (pte_user(pte)) return 1; } return 0; -#else - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER | - _PAGE_PRIVILEGED)) == - (_PAGE_PRESENT | _PAGE_USER); -#endif } static struct page *maybe_pte_to_page(pte_t pte) @@ -73,7 +66,7 @@ static struct page *maybe_pte_to_page(pte_t pte) return page; } -#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 +#ifdef CONFIG_PPC_BOOK3S /* Server-style MMU handles coherency when hashing if HW exec permission * is supposed per page (currently 64-bit only). If not, then, we always @@ -106,7 +99,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, return pte; } -#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ +#else /* CONFIG_PPC_BOOK3S */ /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so @@ -117,7 +110,7 @@ static pte_t set_pte_filter(pte_t pte) struct page *pg; /* No exec permission in the first place, move on */ - if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ @@ -137,7 +130,7 @@ static pte_t set_pte_filter(pte_t pte) } /* Else, we filter out _PAGE_EXEC */ - return __pte(pte_val(pte) & ~_PAGE_EXEC); + return pte_exprotect(pte); } static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, @@ -150,7 +143,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, * if necessary. Also if _PAGE_EXEC is already set, same deal, * we just bail out */ - if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + if (dirty || pte_exec(pte) || !is_exec_fault()) return pte; #ifdef CONFIG_DEBUG_VM @@ -176,10 +169,10 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, set_bit(PG_arch_1, &pg->flags); bail: - return __pte(pte_val(pte) | _PAGE_EXEC); + return pte_mkexec(pte); } -#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ +#endif /* CONFIG_PPC_BOOK3S */ /* * set_pte stores a linux PTE into the linux page table. @@ -188,14 +181,13 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* - * When handling numa faults, we already have the pte marked - * _PAGE_PRESENT, but we can be sure that it is not in hpte. - * Hence we can use set_pte_at for them. + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. */ - VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); /* Add the pte bit when trying to set a pte */ - pte = __pte(pte_val(pte) | _PAGE_PTE); + pte = pte_mkpte(pte); /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this @@ -221,13 +213,54 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { - if (!is_vm_hugetlb_page(vma)) - assert_pte_locked(vma->vm_mm, address); - __ptep_set_access_flags(vma->vm_mm, ptep, entry, address); - flush_tlb_page(vma, address); + assert_pte_locked(vma->vm_mm, address); + __ptep_set_access_flags(vma, ptep, entry, + address, mmu_virtual_psize); + } + return changed; +} + +#ifdef CONFIG_HUGETLB_PAGE +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ +#ifdef HUGETLB_NEED_PRELOAD + /* + * The "return 1" forces a call of update_mmu_cache, which will write a + * TLB entry. Without this, platforms that don't do a write of the TLB + * entry in the TLB miss handler asm will fault ad infinitum. + */ + ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return 1; +#else + int changed, psize; + + pte = set_access_flags_filter(pte, vma, dirty); + changed = !pte_same(*(ptep), pte); + if (changed) { + +#ifdef CONFIG_PPC_BOOK3S_64 + struct hstate *h = hstate_vma(vma); + + psize = hstate_get_psize(h); +#ifdef CONFIG_DEBUG_VM + assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep)); +#endif + +#else + /* + * Not used on non book3s64 platforms. But 8xx + * can possibly use tsize derived from hstate. + */ + psize = 0; +#endif + __ptep_set_access_flags(vma, ptep, pte, addr, psize); } return changed; +#endif } +#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_DEBUG_VM void assert_pte_locked(struct mm_struct *mm, unsigned long addr) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index d35d9ad3c1cd..bda3c6f1bd32 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -50,7 +50,7 @@ __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) if (slab_is_available()) { pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); } else { - pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); + pte = __va(memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE)); if (pte) clear_page(pte); } @@ -76,56 +76,69 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED, - __builtin_return_address(0)); + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap); void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_NO_CACHE, - __builtin_return_address(0)); + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_wc); void __iomem * +ioremap_wt(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_coherent); + +void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + pte_t pte = __pte(flags); + /* writeable implies dirty for kernel addresses */ - if ((flags & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO) - flags |= _PAGE_DIRTY | _PAGE_HWWRITE; + if (pte_write(pte)) + pte = pte_mkdirty(pte); /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); - flags |= _PAGE_PRIVILEGED; + pte = pte_exprotect(pte); + pte = pte_mkprivileged(pte); - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); void __iomem * __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, - void *caller) +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { unsigned long v, i; phys_addr_t p; int err; - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. @@ -148,7 +161,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, * mem_init() sets high_memory so only do the check after that. */ if (slab_is_available() && (p < virt_to_phys(high_memory)) && - !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) { + page_is_ram(__phys_to_pfn(p))) { printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n", (unsigned long long)p, __builtin_return_address(0)); return NULL; @@ -183,7 +196,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, err = 0; for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v+i, p+i, flags); + err = map_kernel_page(v + i, p + i, prot); if (err) { if (slab_is_available()) vunmap((void *)v); @@ -209,7 +222,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -224,10 +237,8 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) /* The PTE should never be already set nor present in the * hash table */ - BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && - flags); - set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot)); + set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot)); } smp_wmb(); return err; @@ -238,7 +249,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) */ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { - unsigned long v, s, f; + unsigned long v, s; phys_addr_t p; int ktext; @@ -248,11 +259,10 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) for (; s < top; s += PAGE_SIZE) { ktext = ((char *)v >= _stext && (char *)v < etext) || ((char *)v >= _sinittext && (char *)v < _einittext); - f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL); - map_kernel_page(v, p, f); + map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) - hash_preload(&init_mm, v, 0, 0x300); + hash_preload(&init_mm, v, false, 0x300); #endif v += PAGE_SIZE; p += PAGE_SIZE; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index adf469f312f2..fb1375c07e8c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -33,7 +33,6 @@ #include <linux/swap.h> #include <linux/stddef.h> #include <linux/vmalloc.h> -#include <linux/memblock.h> #include <linux/slab.h> #include <linux/hugetlb.h> @@ -47,21 +46,14 @@ #include <asm/smp.h> #include <asm/machdep.h> #include <asm/tlb.h> -#include <asm/trace.h> #include <asm/processor.h> #include <asm/cputable.h> #include <asm/sections.h> #include <asm/firmware.h> #include <asm/dma.h> -#include <asm/powernv.h> #include "mmu_decl.h" -#ifdef CONFIG_PPC_BOOK3S_64 -#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) -#error TASK_SIZE_USER64 exceeds user VSID range -#endif -#endif #ifdef CONFIG_PPC_BOOK3S_64 /* @@ -80,8 +72,6 @@ unsigned long __pud_index_size; EXPORT_SYMBOL(__pud_index_size); unsigned long __pgd_index_size; EXPORT_SYMBOL(__pgd_index_size); -unsigned long __pmd_cache_index; -EXPORT_SYMBOL(__pmd_cache_index); unsigned long __pud_cache_index; EXPORT_SYMBOL(__pud_cache_index); unsigned long __pte_table_size; @@ -123,17 +113,12 @@ unsigned long ioremap_bot = IOREMAP_BASE; * __ioremap_at - Low level function to establish the page tables * for an IO mapping */ -void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, - unsigned long flags) +void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) { unsigned long i; - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - /* We don't support the 4K PFN hack with ioremap */ - if (flags & H_PAGE_4K_PFN) + if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -141,7 +126,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea + i, pa + i, prot)) return NULL; return (void __iomem *)ea; @@ -162,7 +147,7 @@ void __iounmap_at(void *ea, unsigned long size) } void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) + pgprot_t prot, void *caller) { phys_addr_t paligned; void __iomem *ret; @@ -192,11 +177,11 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, flags); + ret = __ioremap_at(paligned, area->addr, size, prot); if (!ret) vunmap(area->addr); } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); + ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); if (ret) ioremap_bot += size; } @@ -209,49 +194,59 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, void __iomem * __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} + +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_WRITE) - flags |= _PAGE_DIRTY; + if (pte_write(pte)) + pte = pte_mkdirty(pte); /* we don't want to let _PAGE_EXEC leak out */ - flags &= ~_PAGE_EXEC; + pte = pte_exprotect(pte); /* * Force kernel mapping. */ - flags &= ~_PAGE_USER; - flags |= _PAGE_PRIVILEGED; + pte = pte_mkprivileged(pte); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller); + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); } @@ -316,177 +311,11 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) + if (pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) return pte_page(pmd_pte(pmd)); return virt_to_page(pmd_page_vaddr(pmd)); } -#ifdef CONFIG_PPC_64K_PAGES -static pte_t *get_from_cache(struct mm_struct *mm) -{ - void *pte_frag, *ret; - - spin_lock(&mm->page_table_lock); - ret = mm->context.pte_frag; - if (ret) { - pte_frag = ret + PTE_FRAG_SIZE; - /* - * If we have taken up all the fragments mark PTE page NULL - */ - if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) - pte_frag = NULL; - mm->context.pte_frag = pte_frag; - } - spin_unlock(&mm->page_table_lock); - return (pte_t *)ret; -} - -static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) -{ - void *ret = NULL; - struct page *page; - - if (!kernel) { - page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - } else { - page = alloc_page(PGALLOC_GFP); - if (!page) - return NULL; - } - - ret = page_address(page); - spin_lock(&mm->page_table_lock); - /* - * If we find pgtable_page set, we return - * the allocated page with single fragement - * count. - */ - if (likely(!mm->context.pte_frag)) { - set_page_count(page, PTE_FRAG_NR); - mm->context.pte_frag = ret + PTE_FRAG_SIZE; - } - spin_unlock(&mm->page_table_lock); - - return (pte_t *)ret; -} - -pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) -{ - pte_t *pte; - - pte = get_from_cache(mm); - if (pte) - return pte; - - return __alloc_for_cache(mm, kernel); -} -#endif /* CONFIG_PPC_64K_PAGES */ - -void pte_fragment_free(unsigned long *table, int kernel) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - if (!kernel) - pgtable_page_dtor(page); - free_unref_page(page); - } -} - -#ifdef CONFIG_SMP -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - if (!shift) - /* PTE page needs special handling */ - pte_fragment_free(table, 0); - else { - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(shift), table); - } -} -#else -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - if (!shift) { - /* PTE page needs special handling */ - pte_fragment_free(table, 0); - } else { - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(shift), table); - } -} -#endif - -#ifdef CONFIG_PPC_BOOK3S_64 -void __init mmu_partition_table_init(void) -{ - unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; - unsigned long ptcr; - - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); - partition_tb = __va(memblock_alloc_base(patb_size, patb_size, - MEMBLOCK_ALLOC_ANYWHERE)); - - /* Initialize the Partition Table with no entries */ - memset((void *)partition_tb, 0, patb_size); - - /* - * update partition table control register, - * 64 K size. - */ - ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); - powernv_set_nmmu_ptcr(ptcr); -} - -void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) -{ - unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); - - partition_tb[lpid].patb0 = cpu_to_be64(dw0); - partition_tb[lpid].patb1 = cpu_to_be64(dw1); - - /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. - */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); - } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} -EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); -#endif /* CONFIG_PPC_BOOK3S_64 */ - #ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void) { diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index ba71c5481f42..b271b283c785 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -14,9 +14,12 @@ DEFINE_STATIC_KEY_TRUE(pkey_disabled); bool pkey_execute_disable_supported; int pkeys_total; /* Total pkeys as per device tree */ bool pkeys_devtree_defined; /* pkey property exported by device tree */ -u32 initial_allocation_mask; /* Bits set for reserved keys */ -u64 pkey_amr_uamor_mask; /* Bits in AMR/UMOR not to be touched */ +u32 initial_allocation_mask; /* Bits set for the initially allocated keys */ +u32 reserved_allocation_mask; /* Bits set for reserved keys */ +u64 pkey_amr_mask; /* Bits in AMR not to be touched */ u64 pkey_iamr_mask; /* Bits in AMR not to be touched */ +u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */ +int execute_only_key = 2; #define AMR_BITS_PER_PKEY 2 #define AMR_RD_BIT 0x1UL @@ -42,7 +45,7 @@ static void scan_pkey_feature(void) * Since any pkey can be used for data or execute, we will just treat * all keys as equal and track them as one entity. */ - pkeys_total = be32_to_cpu(vals[0]); + pkeys_total = vals[0]; pkeys_devtree_defined = true; } @@ -91,7 +94,7 @@ int pkey_initialize(void) * arch-neutral code. */ pkeys_total = min_t(int, pkeys_total, - (ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)); + ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1)); if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total) static_branch_enable(&pkey_disabled); @@ -119,23 +122,39 @@ int pkey_initialize(void) #else os_reserved = 0; #endif - /* - * Bits are in LE format. NOTE: 1, 0 are reserved. - * key 0 is the default key, which allows read/write/execute. - * key 1 is recommended not to be used. PowerISA(3.0) page 1015, - * programming note. - */ - initial_allocation_mask = ~0x0; + /* Bits are in LE format. */ + reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); /* register mask is in BE format */ - pkey_amr_uamor_mask = ~0x0ul; + pkey_amr_mask = ~0x0ul; + pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); + pkey_iamr_mask = ~0x0ul; + pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); + pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - for (i = 2; i < (pkeys_total - os_reserved); i++) { - initial_allocation_mask &= ~(0x1 << i); - pkey_amr_uamor_mask &= ~(0x3ul << pkeyshift(i)); - pkey_iamr_mask &= ~(0x1ul << pkeyshift(i)); + pkey_uamor_mask = ~0x0ul; + pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + /* mark the rest of the keys as reserved and hence unavailable */ + for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { + reserved_allocation_mask |= (0x1 << i); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); + } + initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); + + if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { + /* + * Insufficient number of keys to support + * execute only key. Mark it unavailable. + * Any AMR, UAMOR, IAMR bit set for + * this key is irrelevant since this key + * can never be allocated. + */ + execute_only_key = -1; } + return 0; } @@ -146,8 +165,7 @@ void pkey_mm_init(struct mm_struct *mm) if (static_branch_likely(&pkey_disabled)) return; mm_pkey_allocation_map(mm) = initial_allocation_mask; - /* -1 means unallocated or invalid */ - mm->context.execute_only_pkey = -1; + mm->context.execute_only_pkey = execute_only_key; } static inline u64 read_amr(void) @@ -216,33 +234,6 @@ static inline void init_iamr(int pkey, u8 init_bits) write_iamr(old_iamr | new_iamr_bits); } -static void pkey_status_change(int pkey, bool enable) -{ - u64 old_uamor; - - /* Reset the AMR and IAMR bits for this key */ - init_amr(pkey, 0x0); - init_iamr(pkey, 0x0); - - /* Enable/disable key */ - old_uamor = read_uamor(); - if (enable) - old_uamor |= (0x3ul << pkeyshift(pkey)); - else - old_uamor &= ~(0x3ul << pkeyshift(pkey)); - write_uamor(old_uamor); -} - -void __arch_activate_pkey(int pkey) -{ - pkey_status_change(pkey, true); -} - -void __arch_deactivate_pkey(int pkey) -{ - pkey_status_change(pkey, false); -} - /* * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that * specified in @init_val. @@ -292,9 +283,6 @@ void thread_pkey_regs_restore(struct thread_struct *new_thread, if (static_branch_likely(&pkey_disabled)) return; - /* - * TODO: Just set UAMOR to zero if @new_thread hasn't used any keys yet. - */ if (old_thread->amr != new_thread->amr) write_amr(new_thread->amr); if (old_thread->iamr != new_thread->iamr) @@ -308,9 +296,13 @@ void thread_pkey_regs_init(struct thread_struct *thread) if (static_branch_likely(&pkey_disabled)) return; - write_amr(read_amr() & pkey_amr_uamor_mask); - write_iamr(read_iamr() & pkey_iamr_mask); - write_uamor(read_uamor() & pkey_amr_uamor_mask); + thread->amr = pkey_amr_mask; + thread->iamr = pkey_iamr_mask; + thread->uamor = pkey_uamor_mask; + + write_uamor(pkey_uamor_mask); + write_amr(pkey_amr_mask); + write_iamr(pkey_iamr_mask); } static inline bool pkey_allows_readwrite(int pkey) @@ -325,48 +317,7 @@ static inline bool pkey_allows_readwrite(int pkey) int __execute_only_pkey(struct mm_struct *mm) { - bool need_to_set_mm_pkey = false; - int execute_only_pkey = mm->context.execute_only_pkey; - int ret; - - /* Do we need to assign a pkey for mm's execute-only maps? */ - if (execute_only_pkey == -1) { - /* Go allocate one to use, which might fail */ - execute_only_pkey = mm_pkey_alloc(mm); - if (execute_only_pkey < 0) - return -1; - need_to_set_mm_pkey = true; - } - - /* - * We do not want to go through the relatively costly dance to set AMR - * if we do not need to. Check it first and assume that if the - * execute-only pkey is readwrite-disabled than we do not have to set it - * ourselves. - */ - if (!need_to_set_mm_pkey && !pkey_allows_readwrite(execute_only_pkey)) - return execute_only_pkey; - - /* - * Set up AMR so that it denies access for everything other than - * execution. - */ - ret = __arch_set_user_pkey_access(current, execute_only_pkey, - PKEY_DISABLE_ACCESS | - PKEY_DISABLE_WRITE); - /* - * If the AMR-set operation failed somehow, just return 0 and - * effectively disable execute-only support. - */ - if (ret) { - mm_pkey_free(mm, execute_only_pkey); - return -1; - } - - /* We got one, store it and use it from here on out */ - if (need_to_set_mm_pkey) - mm->context.execute_only_pkey = execute_only_pkey; - return execute_only_pkey; + return mm->context.execute_only_pkey; } static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) @@ -386,9 +337,9 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, { /* * If the currently associated pkey is execute-only, but the requested - * protection requires read or write, move it back to the default pkey. + * protection is not execute-only, move it back to the default pkey. */ - if (vma_is_pkey_exec_only(vma) && (prot & (PROT_READ | PROT_WRITE))) + if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) return 0; /* @@ -410,9 +361,6 @@ static bool pkey_access_permitted(int pkey, bool write, bool execute) int pkey_shift; u64 amr; - if (!pkey) - return true; - if (!is_pkey_enabled(pkey)) return true; diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 2a049fb8523d..f6f575bae3bc 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -163,11 +163,11 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, * Preload a translation in the hash table */ void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) + bool is_exec, unsigned long trap) { pmd_t *pmd; - if (Hash == 0) + if (!Hash) return; pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); if (!pmd_none(*pmd)) @@ -224,7 +224,7 @@ void __init MMU_init_hw(void) * Find some memory for the hash table. */ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = __va(memblock_alloc(Hash_size, Hash_size)); + Hash = __va(memblock_phys_alloc(Hash_size, Hash_size)); memset(Hash, 0, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 13cfe413b40d..bc3914d54e26 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -14,14 +14,17 @@ * 2 of the License, or (at your option) any later version. */ +#include <asm/asm-prototypes.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> +#include <asm/ppc-opcode.h> #include <asm/cputable.h> #include <asm/cacheflush.h> #include <asm/smp.h> #include <linux/compiler.h> +#include <linux/context_tracking.h> #include <linux/mm_types.h> #include <asm/udbg.h> @@ -29,11 +32,10 @@ enum slb_index { LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000000000000000) */ - KSTACK_INDEX = 2, /* Kernel stack map */ + KSTACK_INDEX = 1, /* Kernel stack map */ }; -extern void slb_allocate(unsigned long ea); +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) @@ -44,13 +46,35 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize, return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, unsigned long flags) { - return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + return (vsid << slb_vsid_shift(ssize)) | flags | ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); } +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +static void assert_slb_presence(bool present, unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return; + + asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); + + WARN_ON(present == (tmp == 0)); +#endif +} + static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, enum slb_index index) @@ -62,14 +86,14 @@ static inline void slb_shadow_update(unsigned long ea, int ssize, * updating it. No write barriers are needed here, provided * we only update the current CPU's SLB shadow buffer. */ - p->save_area[index].esid = 0; - p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags)); - p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index)); + WRITE_ONCE(p->save_area[index].esid, 0); + WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); + WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); } static inline void slb_shadow_clear(enum slb_index index) { - get_slb_shadow()->save_area[index].esid = 0; + WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); } static inline void create_shadowed_slbe(unsigned long ea, int ssize, @@ -83,54 +107,61 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize, */ slb_shadow_update(ea, ssize, flags, index); + assert_slb_presence(false, ea); asm volatile("slbmte %0,%1" : : "r" (mk_vsid_data(ea, ssize, flags)), "r" (mk_esid_data(ea, ssize, index)) : "memory" ); } -static void __slb_flush_and_rebolt(void) +/* + * Insert bolted entries into SLB (which may not be empty, so don't clear + * slb_cache_ptr). + */ +void __slb_restore_bolted_realmode(void) { - /* If you change this make sure you change SLB_NUM_BOLTED - * and PR KVM appropriately too. */ - unsigned long linear_llp, vmalloc_llp, lflags, vflags; - unsigned long ksp_esid_data, ksp_vsid_data; - - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; - lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; + struct slb_shadow *p = get_slb_shadow(); + enum slb_index index; - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); - if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { - ksp_esid_data &= ~SLB_ESID_V; - ksp_vsid_data = 0; - slb_shadow_clear(KSTACK_INDEX); - } else { - /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX); - ksp_vsid_data = - be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid); + /* No isync needed because realmode. */ + for (index = 0; index < SLB_NUM_BOLTED; index++) { + asm volatile("slbmte %0,%1" : + : "r" (be64_to_cpu(p->save_area[index].vsid)), + "r" (be64_to_cpu(p->save_area[index].esid))); } - /* We need to do this all in asm, so we're sure we don't touch - * the stack between the slbia and rebolting it. */ - asm volatile("isync\n" - "slbia\n" - /* Slot 1 - first VMALLOC segment */ - "slbmte %0,%1\n" - /* Slot 2 - kernel stack */ - "slbmte %2,%3\n" - "isync" - :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), - "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)), - "r"(ksp_vsid_data), - "r"(ksp_esid_data) - : "memory"); + assert_slb_presence(true, local_paca->kstack); +} + +/* + * Insert the bolted entries into an empty SLB. + */ +void slb_restore_bolted_realmode(void) +{ + __slb_restore_bolted_realmode(); + get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } -void slb_flush_and_rebolt(void) +/* + * This flushes all SLB entries including 0, so it must be realmode. + */ +void slb_flush_all_realmode(void) { + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); +} + +/* + * This flushes non-bolted entries, it can be run in virtual mode. Must + * be called with interrupts disabled. + */ +void slb_flush_and_restore_bolted(void) +{ + struct slb_shadow *p = get_slb_shadow(); + + BUILD_BUG_ON(SLB_NUM_BOLTED != 2); WARN_ON(!irqs_disabled()); @@ -140,56 +171,241 @@ void slb_flush_and_rebolt(void) */ hard_irq_disable(); - __slb_flush_and_rebolt(); + asm volatile("isync\n" + "slbia\n" + "slbmte %0, %1\n" + "isync\n" + :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), + "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) + : "memory"); + assert_slb_presence(true, get_paca()->kstack); + get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; +} + +void slb_save_contents(struct slb_entry *slb_ptr) +{ + int i; + unsigned long e, v; + + /* Save slb_cache_ptr value. */ + get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr; + + if (!slb_ptr) + return; + + for (i = 0; i < mmu_slb_size; i++) { + asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i)); + asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i)); + slb_ptr->esid = e; + slb_ptr->vsid = v; + slb_ptr++; + } +} + +void slb_dump_contents(struct slb_entry *slb_ptr) +{ + int i, n; + unsigned long e, v; + unsigned long llp; + + if (!slb_ptr) + return; + + pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); + pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); + + for (i = 0; i < mmu_slb_size; i++) { + e = slb_ptr->esid; + v = slb_ptr->vsid; + slb_ptr++; + + if (!e && !v) + continue; + + pr_err("%02d %016lx %016lx\n", i, e, v); + + if (!(e & SLB_ESID_V)) { + pr_err("\n"); + continue; + } + llp = v & SLB_VSID_LLP; + if (v & SLB_VSID_B_1T) { + pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID_1T(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); + } else { + pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); + } + } + pr_err("----------------------------------\n"); + + /* Dump slb cache entires as well. */ + pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); + pr_err("Valid SLB cache entries:\n"); + n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); + for (i = 0; i < n; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + pr_err("Rest of SLB cache entries:\n"); + for (i = n; i < SLB_CACHE_ENTRIES; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); } void slb_vmalloc_update(void) { - unsigned long vflags; + /* + * vmalloc is not bolted, so just have to flush non-bolted. + */ + slb_flush_and_restore_bolted(); +} + +static bool preload_hit(struct thread_info *ti, unsigned long esid) +{ + unsigned char i; - vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); - slb_flush_and_rebolt(); + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + if (esid == ti->slb_preload_esid[idx]) + return true; + } + return false; } -/* Helper function to compare esids. There are four cases to handle. - * 1. The system is not 1T segment size capable. Use the GET_ESID compare. - * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. - * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. - * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. - */ -static inline int esids_match(unsigned long addr1, unsigned long addr2) +static bool preload_add(struct thread_info *ti, unsigned long ea) { - int esid_1t_count; + unsigned char idx; + unsigned long esid; - /* System is not 1T segment size capable. */ - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - return (GET_ESID(addr1) == GET_ESID(addr2)); + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + /* EAs are stored >> 28 so 256MB segments don't need clearing */ + if (ea & ESID_MASK_1T) + ea &= ESID_MASK_1T; + } + + esid = ea >> SID_SHIFT; - esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + - ((addr2 >> SID_SHIFT_1T) != 0)); + if (preload_hit(ti, esid)) + return false; - /* both addresses are < 1T */ - if (esid_1t_count == 0) - return (GET_ESID(addr1) == GET_ESID(addr2)); + idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; + ti->slb_preload_esid[idx] = esid; + if (ti->slb_preload_nr == SLB_PRELOAD_NR) + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; + else + ti->slb_preload_nr++; - /* One address < 1T, the other > 1T. Not a match */ - if (esid_1t_count == 1) - return 0; + return true; +} - /* Both addresses are > 1T. */ - return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); +static void preload_age(struct thread_info *ti) +{ + if (!ti->slb_preload_nr) + return; + ti->slb_preload_nr--; + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; } +void slb_setup_new_exec(void) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long exec = 0x10000000; + + WARN_ON(irqs_disabled()); + + /* + * preload cache can only be used to determine whether a SLB + * entry exists if it does not start to overflow. + */ + if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* + * We have no good place to clear the slb preload cache on exec, + * flush_thread is about the earliest arch hook but that happens + * after we switch to the mm and have aleady preloaded the SLBEs. + * + * For the most part that's probably okay to use entries from the + * previous exec, they will age out if unused. It may turn out to + * be an advantage to clear the cache before switching to it, + * however. + */ + + /* + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. + */ + if (!is_kernel_addr(exec)) { + if (preload_add(ti, exec)) + slb_allocate_user(mm, exec); + } + + /* Libraries and mmaps. */ + if (!is_kernel_addr(mm->mmap_base)) { + if (preload_add(ti, mm->mmap_base)) + slb_allocate_user(mm, mm->mmap_base); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + +void preload_new_slb_context(unsigned long start, unsigned long sp) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long heap = mm->start_brk; + + WARN_ON(irqs_disabled()); + + /* see above */ + if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* Userspace entry address. */ + if (!is_kernel_addr(start)) { + if (preload_add(ti, start)) + slb_allocate_user(mm, start); + } + + /* Top of stack, grows down. */ + if (!is_kernel_addr(sp)) { + if (preload_add(ti, sp)) + slb_allocate_user(mm, sp); + } + + /* Bottom of heap, grows up. */ + if (heap && !is_kernel_addr(heap)) { + if (preload_add(ti, heap)) + slb_allocate_user(mm, heap); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + + /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long offset; - unsigned long slbie_data = 0; - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long exec_base; + struct thread_info *ti = task_thread_info(tsk); + unsigned char i; /* * We need interrupts hard-disabled here, not just soft-disabled, @@ -198,91 +414,107 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * which would update the slb_cache/slb_cache_ptr fields in the PACA. */ hard_irq_disable(); - offset = get_paca()->slb_cache_ptr; - if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && - offset <= SLB_CACHE_ENTRIES) { - int i; - asm volatile("isync" : : : "memory"); - for (i = 0; i < offset; i++) { - slbie_data = (unsigned long)get_paca()->slb_cache[i] - << SID_SHIFT; /* EA */ - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* C set for user addresses */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } - asm volatile("isync" : : : "memory"); + asm volatile("isync" : : : "memory"); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * SLBIA IH=3 invalidates all Class=1 SLBEs and their + * associated lookaside structures, which matches what + * switch_slb wants. So ARCH_300 does not use the slb + * cache. + */ + asm volatile(PPC_SLBIA(3)); } else { - __slb_flush_and_rebolt(); - } + unsigned long offset = get_paca()->slb_cache_ptr; + + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + unsigned long slbie_data = 0; + + for (i = 0; i < offset; i++) { + unsigned long ea; + + ea = (unsigned long) + get_paca()->slb_cache[i] << SID_SHIFT; + /* + * Could assert_slb_presence(true) here, but + * hypervisor or machine check could have come + * in and removed the entry at this point. + */ + + slbie_data = ea; + slbie_data |= user_segment_size(slbie_data) + << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* user slbs have C=1 */ + asm volatile("slbie %0" : : "r" (slbie_data)); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) + asm volatile("slbie %0" : : "r" (slbie_data)); + + } else { + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + unsigned long ksp_vsid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile(PPC_SLBIA(1) "\n" + "slbmte %0,%1\n" + "isync" + :: "r"(ksp_vsid_data), + "r"(ksp_esid_data)); + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + } - /* Workaround POWER5 < DD2.1 issue */ - if (offset == 1 || offset > SLB_CACHE_ENTRIES) - asm volatile("slbie %0" : : "r" (slbie_data)); + get_paca()->slb_cache_ptr = 0; + } + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; - get_paca()->slb_cache_ptr = 0; copy_mm_to_paca(mm); /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. + * We gradually age out SLBs after a number of context switches to + * reduce reload overhead of unused entries (like we do with FP/VEC + * reload). Each time we wrap 256 switches, take an entry out of the + * SLB preload cache. */ - exec_base = 0x10000000; + tsk->thread.load_slb++; + if (!tsk->thread.load_slb) { + unsigned long pc = KSTK_EIP(tsk); - if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(exec_base)) - return; + preload_age(ti); + preload_add(ti, pc); + } - slb_allocate(pc); + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + unsigned long ea; - if (!esids_match(pc, stack)) - slb_allocate(stack); + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; - if (!esids_match(pc, exec_base) && - !esids_match(stack, exec_base)) - slb_allocate(exec_base); -} - -static inline void patch_slb_encoding(unsigned int *insn_addr, - unsigned int immed) -{ + slb_allocate_user(mm, ea); + } /* - * This function patches either an li or a cmpldi instruction with - * a new immediate value. This relies on the fact that both li - * (which is actually addi) and cmpldi both take a 16-bit immediate - * value, and it is situated in the same location in the instruction, - * ie. bits 16-31 (Big endian bit order) or the lower 16 bits. - * The signedness of the immediate operand differs between the two - * instructions however this code is only ever patching a small value, - * much less than 1 << 15, so we can get away with it. - * To patch the value we read the existing instruction, clear the - * immediate value, and or in our new value, then write the instruction - * back. + * Synchronize slbmte preloads with possible subsequent user memory + * address accesses by the kernel (user mode won't happen until + * rfid, which is safe). */ - unsigned int insn = (*insn_addr & 0xffff0000) | immed; - patch_instruction(insn_addr, insn); + asm volatile("isync" : : : "memory"); } -extern u32 slb_miss_kernel_load_linear[]; -extern u32 slb_miss_kernel_load_io[]; -extern u32 slb_compare_rr_to_size[]; -extern u32 slb_miss_kernel_load_vmemmap[]; - void slb_set_size(u16 size) { - if (mmu_slb_size == size) - return; - mmu_slb_size = size; - patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); } void slb_initialize(void) { unsigned long linear_llp, vmalloc_llp, io_llp; - unsigned long lflags, vflags; + unsigned long lflags; static int slb_encoding_inited; #ifdef CONFIG_SPARSEMEM_VMEMMAP unsigned long vmemmap_llp; @@ -298,34 +530,24 @@ void slb_initialize(void) #endif if (!slb_encoding_inited) { slb_encoding_inited = 1; - patch_slb_encoding(slb_miss_kernel_load_linear, - SLB_VSID_KERNEL | linear_llp); - patch_slb_encoding(slb_miss_kernel_load_io, - SLB_VSID_KERNEL | io_llp); - patch_slb_encoding(slb_compare_rr_to_size, - mmu_slb_size); - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); pr_devel("SLB: io LLP = %04lx\n", io_llp); - #ifdef CONFIG_SPARSEMEM_VMEMMAP - patch_slb_encoding(slb_miss_kernel_load_vmemmap, - SLB_VSID_KERNEL | vmemmap_llp); pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); #endif } - get_paca()->stab_rr = SLB_NUM_BOLTED; + get_paca()->stab_rr = SLB_NUM_BOLTED - 1; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; /* Invalidate the entire SLB (even entry 0) & all the ERATS */ asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); /* For the boot cpu, we're running on the stack in init_thread_union, * which is in the first segment of the linear mapping, and also @@ -340,3 +562,260 @@ void slb_initialize(void) asm volatile("isync":::"memory"); } + +static void slb_cache_update(unsigned long esid_data) +{ + int slb_cache_index; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; /* ISAv3.0B and later does not use slb_cache */ + + /* + * Now update slb cache entries + */ + slb_cache_index = local_paca->slb_cache_ptr; + if (slb_cache_index < SLB_CACHE_ENTRIES) { + /* + * We have space in slb cache for optimized switch_slb(). + * Top 36 bits from esid_data as per ISA + */ + local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; + local_paca->slb_cache_ptr++; + } else { + /* + * Our cache is full and the current cache content strictly + * doesn't indicate the active SLB conents. Bump the ptr + * so that switch_slb() will ignore the cache. + */ + local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; + } +} + +static enum slb_index alloc_slb_index(bool kernel) +{ + enum slb_index index; + + /* + * The allocation bitmaps can become out of synch with the SLB + * when the _switch code does slbie when bolting a new stack + * segment and it must not be anywhere else in the SLB. This leaves + * a kernel allocated entry that is unused in the SLB. With very + * large systems or small segment sizes, the bitmaps could slowly + * fill with these entries. They will eventually be cleared out + * by the round robin allocator in that case, so it's probably not + * worth accounting for. + */ + + /* + * SLBs beyond 32 entries are allocated with stab_rr only + * POWER7/8/9 have 32 SLB entries, this could be expanded if a + * future CPU has more. + */ + if (local_paca->slb_used_bitmap != U32_MAX) { + index = ffz(local_paca->slb_used_bitmap); + local_paca->slb_used_bitmap |= 1U << index; + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + } else { + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = local_paca->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + local_paca->stab_rr = index; + if (index < 32) { + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + else + local_paca->slb_kern_bitmap &= ~(1U << index); + } + } + BUG_ON(index < SLB_NUM_BOLTED); + + return index; +} + +static long slb_insert_entry(unsigned long ea, unsigned long context, + unsigned long flags, int ssize, bool kernel) +{ + unsigned long vsid; + unsigned long vsid_data, esid_data; + enum slb_index index; + + vsid = get_vsid(context, ea, ssize); + if (!vsid) + return -EFAULT; + + /* + * There must not be a kernel SLB fault in alloc_slb_index or before + * slbmte here or the allocation bitmaps could get out of whack with + * the SLB. + * + * User SLB faults or preloads take this path which might get inlined + * into the caller, so add compiler barriers here to ensure unsafe + * memory accesses do not come between. + */ + barrier(); + + index = alloc_slb_index(kernel); + + vsid_data = __mk_vsid_data(vsid, ssize, flags); + esid_data = mk_esid_data(ea, ssize, index); + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + * User preloads should add isync afterwards in case the kernel + * accesses user memory before it returns to userspace with rfid. + */ + assert_slb_presence(false, ea); + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); + + barrier(); + + if (!kernel) + slb_cache_update(esid_data); + + return 0; +} + +static long slb_allocate_kernel(unsigned long ea, unsigned long id) +{ + unsigned long context; + unsigned long flags; + int ssize; + + if (id == KERNEL_REGION_ID) { + + /* We only support upto MAX_PHYSMEM_BITS */ + if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + } else if (id == VMEMMAP_REGION_ID) { + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + } else if (id == VMALLOC_REGION_ID) { + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + + if (ea < H_VMALLOC_END) + flags = local_paca->vmalloc_sllp; + else + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { + return -EFAULT; + } + + ssize = MMU_SEGSIZE_1T; + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + ssize = MMU_SEGSIZE_256M; + + context = get_kernel_context(ea); + return slb_insert_entry(ea, context, flags, ssize, true); +} + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) +{ + unsigned long context; + unsigned long flags; + int bpsize; + int ssize; + + /* + * consider this as bad access if we take a SLB miss + * on an address above addr limit. + */ + if (ea >= mm->context.slb_addr_limit) + return -EFAULT; + + context = get_user_context(&mm->context, ea); + if (!context) + return -EFAULT; + + if (unlikely(ea >= H_PGTABLE_RANGE)) { + WARN_ON(1); + return -EFAULT; + } + + ssize = user_segment_size(ea); + + bpsize = get_slice_psize(mm, ea); + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + + return slb_insert_entry(ea, context, flags, ssize, false); +} + +long do_slb_fault(struct pt_regs *regs, unsigned long ea) +{ + unsigned long id = REGION_ID(ea); + + /* IRQs are not reconciled here, so can't check irqs_disabled */ + VM_WARN_ON(mfmsr() & MSR_EE); + + if (unlikely(!(regs->msr & MSR_RI))) + return -EINVAL; + + /* + * SLB kernel faults must be very careful not to touch anything + * that is not bolted. E.g., PACA and global variables are okay, + * mm->context stuff is not. + * + * SLB user faults can access all of kernel memory, but must be + * careful not to touch things like IRQ state because it is not + * "reconciled" here. The difficulty is that we must use + * fast_exception_return to return from kernel SLB faults without + * looking at possible non-bolted memory. We could test user vs + * kernel faults in the interrupt handler asm and do a full fault, + * reconcile, ret_from_except for user faults which would make them + * first class kernel code. But for performance it's probably nicer + * if they go via fast_exception_return too. + */ + if (id >= KERNEL_REGION_ID) { + long err; +#ifdef CONFIG_DEBUG_VM + /* Catch recursive kernel SLB faults. */ + BUG_ON(local_paca->in_kernel_slb_handler); + local_paca->in_kernel_slb_handler = 1; +#endif + err = slb_allocate_kernel(ea, id); +#ifdef CONFIG_DEBUG_VM + local_paca->in_kernel_slb_handler = 0; +#endif + return err; + } else { + struct mm_struct *mm = current->mm; + long err; + + if (unlikely(!mm)) + return -EFAULT; + + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; + } +} + +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) +{ + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S deleted file mode 100644 index 2cf5ef3fc50d..000000000000 --- a/arch/powerpc/mm/slb_low.S +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Low-level SLB routines - * - * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM - * - * Based on earlier C version: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cputable.h> -#include <asm/page.h> -#include <asm/mmu.h> -#include <asm/pgtable.h> -#include <asm/firmware.h> - -/* - * This macro generates asm code to compute the VSID scramble - * function. Used in slb_allocate() and do_stab_bolted. The function - * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS - * - * rt = register containing the proto-VSID and into which the - * VSID will be stored - * rx = scratch register (clobbered) - * rf = flags - * - * - rt and rx must be different registers - * - The answer will end up in the low VSID_BITS bits of rt. The higher - * bits may contain other garbage, so you may need to mask the - * result. - */ -#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ - lis rx,VSID_MULTIPLIER_##size@h; \ - ori rx,rx,VSID_MULTIPLIER_##size@l; \ - mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ -/* \ - * powermac get slb fault before feature fixup, so make 65 bit part \ - * the default part of feature fixup \ - */ \ -BEGIN_MMU_FTR_SECTION \ - srdi rx,rt,VSID_BITS_65_##size; \ - clrldi rt,rt,(64-VSID_BITS_65_##size); \ - add rt,rt,rx; \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_65_##size; \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ -MMU_FTR_SECTION_ELSE \ - srdi rx,rt,VSID_BITS_##size; \ - clrldi rt,rt,(64-VSID_BITS_##size); \ - add rt,rt,rx; /* add high and low bits */ \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) - - -/* void slb_allocate(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * r3 is preserved. - * No other registers are examined or changed. - */ -_GLOBAL(slb_allocate) - /* - * check for bad kernel/user address - * (ea & ~REGION_MASK) >= PGTABLE_RANGE - */ - rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) - bne- 8f - - srdi r9,r3,60 /* get region */ - srdi r10,r3,SID_SHIFT /* get esid */ - cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - - /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ - blt cr7,0f /* user or kernel? */ - - /* Check if hitting the linear mapping or some other kernel space - */ - bne cr7,1f - - /* Linear mapping encoding bits, the "li" instruction below will - * be patched by the kernel at boot - */ -.globl slb_miss_kernel_load_linear -slb_miss_kernel_load_linear: - li r11,0 - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -1: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - cmpldi cr0,r9,0xf - bne 1f -/* Check virtual memmap region. To be patched at kernel boot */ -.globl slb_miss_kernel_load_vmemmap -slb_miss_kernel_load_vmemmap: - li r11,0 - b 6f -1: -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - /* - * r10 contains the ESID, which is the original faulting EA shifted - * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) - * which is 0xd00038000. That can't be used as an immediate, even if we - * ignored the 0xd, so we have to load it into a register, and we only - * have one register free. So we must load all of (H_VMALLOC_END >> 28) - * into a register and compare ESID against that. - */ - lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 - ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 - // Rotate left 4, then mask with 0xffffffff0 - rldic r11,r11,4,28 // r11 = 0xd00038000 - cmpld r10,r11 // if r10 >= r11 - bge 5f // goto io_mapping - - /* - * vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines. - */ - lhz r11,PACAVMALLOCSLLP(r13) - b 6f -5: - /* IO mapping */ -.globl slb_miss_kernel_load_io -slb_miss_kernel_load_io: - li r11,0 -6: - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -0: /* - * For userspace addresses, make sure this is region 0. - */ - cmpdi r9, 0 - bne- 8f - /* - * user space make sure we are within the allowed limit - */ - ld r11,PACA_SLB_ADDR_LIMIT(r13) - cmpld r3,r11 - bge- 8f - - /* when using slices, we extract the psize off the slice bitmaps - * and then we need to get the sllp encoding off the mmu_psize_defs - * array. - * - * XXX This is a bit inefficient especially for the normal case, - * so we should try to implement a fast path for the standard page - * size using the old sllp value so we avoid the array. We cannot - * really do dynamic patching unfortunately as processes might flip - * between 4k and 64k standard page size - */ -#ifdef CONFIG_PPC_MM_SLICES - /* r10 have esid */ - cmpldi r10,16 - /* below SLICE_LOW_TOP */ - blt 5f - /* - * Handle hpsizes, - * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ - addi r9,r11,PACAHIGHSLICEPSIZE - lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ - /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ - rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 - b 6f - -5: - /* - * Handle lpsizes - * r9 is get_paca()->context.low_slices_psize, r11 is index - */ - ld r9,PACALOWSLICESPSIZE(r13) - mr r11,r10 -6: - sldi r11,r11,2 /* index * 4 */ - /* Extract the psize and multiply to get an array offset */ - srd r9,r9,r11 - andi. r9,r9,0xf - mulli r9,r9,MMUPSIZEDEFSIZE - - /* Now get to the array and obtain the sllp - */ - ld r11,PACATOC(r13) - ld r11,mmu_psize_defs@got(r11) - add r11,r11,r9 - ld r11,MMUPSIZESLLP(r11) - ori r11,r11,SLB_VSID_USER -#else - /* paca context sllp already contains the SLB_VSID_USER bits */ - lhz r11,PACACONTEXTSLLP(r13) -#endif /* CONFIG_PPC_MM_SLICES */ - - ld r9,PACACONTEXTID(r13) -BEGIN_FTR_SECTION - cmpldi r10,0x1000 - bge .Lslb_finish_load_1T -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load - -8: /* invalid EA - return an error indication */ - crset 4*cr0+eq /* indicate failure */ - blr - -/* - * Finish loading of an SLB entry and return - * - * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET - */ -.Lslb_finish_load: - rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,256M) - /* r3 = EA, r11 = VSID data */ - /* - * Find a slot, round robin. Previously we tried to find a - * free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ - - mr r9,r3 - - /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ -7: ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* This gets soft patched on boot. */ -.globl slb_compare_rr_to_size -slb_compare_rr_to_size: - cmpldi r10,0 - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) - -3: - rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ - oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ - - /* r9 = ESID data, r11 = VSID data */ - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - */ - slbmte r11,r10 - - /* we're done for kernel addresses */ - crclr 4*cr0+eq /* set result to "success" */ - bgelr cr7 - - /* Update the slb cache */ - lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r9,SLB_CACHE_ENTRIES - bge 1f - - /* still room in the slb cache */ - sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ - srdi r10,r10,28 /* get the 36 bits of the ESID */ - add r11,r11,r13 /* r11 = (u32 *)paca + offset */ - stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r9,r9,1 /* offset++ */ - b 2f -1: /* offset >= SLB_CACHE_ENTRIES */ - li r9,SLB_CACHE_ENTRIES+1 -2: - sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ - crclr 4*cr0+eq /* set result to "success" */ - blr - -/* - * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. - * - * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 - */ -.Lslb_finish_load_1T: - srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ - rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - - li r10,MMU_SEGSIZE_1T - rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ - - /* r3 = EA, r11 = VSID data */ - clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ - b 7b - - -_ASM_NOKPROBE_SYMBOL(slb_allocate) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) -_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) -#ifdef CONFIG_SPARSEMEM_VMEMMAP -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap) -#endif diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 23ec2c5e3b78..06898c13901d 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -31,58 +31,62 @@ #include <linux/spinlock.h> #include <linux/export.h> #include <linux/hugetlb.h> +#include <linux/sched/mm.h> #include <asm/mman.h> #include <asm/mmu.h> #include <asm/copro.h> #include <asm/hugetlb.h> +#include <asm/mmu_context.h> static DEFINE_SPINLOCK(slice_convert_lock); -/* - * One bit per slice. We have lower slices which cover 256MB segments - * upto 4G range. That gets us 16 low slices. For the rest we track slices - * in 1TB size. - */ -struct slice_mask { - u64 low_slices; - DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); -}; #ifdef DEBUG int _slice_debug = 1; -static void slice_print_mask(const char *label, struct slice_mask mask) +static void slice_print_mask(const char *label, const struct slice_mask *mask) { if (!_slice_debug) return; - pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices); - pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices); + pr_devel("%s low_slice: %*pbl\n", label, + (int)SLICE_NUM_LOW, &mask->low_slices); + pr_devel("%s high_slice: %*pbl\n", label, + (int)SLICE_NUM_HIGH, mask->high_slices); } #define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) #else -static void slice_print_mask(const char *label, struct slice_mask mask) {} +static void slice_print_mask(const char *label, const struct slice_mask *mask) {} #define slice_dbg(fmt...) #endif +static inline bool slice_addr_is_low(unsigned long addr) +{ + u64 tmp = (u64)addr; + + return tmp < SLICE_LOW_TOP; +} + static void slice_range_to_mask(unsigned long start, unsigned long len, struct slice_mask *ret) { unsigned long end = start + len - 1; ret->low_slices = 0; - bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); - if (start < SLICE_LOW_TOP) { - unsigned long mend = min(end, (SLICE_LOW_TOP - 1)); + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - (1u << GET_LOW_SLICE_INDEX(start)); } - if ((start + len) > SLICE_LOW_TOP) { + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { unsigned long start_index = GET_HIGH_SLICE_INDEX(start); unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; @@ -113,11 +117,13 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) unsigned long start = slice << SLICE_HIGH_SHIFT; unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); +#ifdef CONFIG_PPC64 /* Hack, so that each addresses is controlled by exactly one * of the high or low area bitmaps, the first high area starts * at 4GB, not 0 */ if (start == 0) start = SLICE_LOW_TOP; +#endif return !slice_area_is_free(mm, start, end - start); } @@ -128,13 +134,14 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, unsigned long i; ret->low_slices = 0; - bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + if (SLICE_NUM_HIGH) + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); for (i = 0; i < SLICE_NUM_LOW; i++) if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (high_limit <= SLICE_LOW_TOP) + if (slice_addr_is_low(high_limit - 1)) return; for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) @@ -142,53 +149,75 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, __set_bit(i, ret->high_slices); } -static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret, - unsigned long high_limit) +#ifdef CONFIG_PPC_BOOK3S_64 +static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) { - unsigned char *hpsizes; - int index, mask_index; - unsigned long i; - u64 lpsizes; - - ret->low_slices = 0; - bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + return &mm->context.mask_64k; +#endif + if (psize == MMU_PAGE_4K) + return &mm->context.mask_4k; +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_16M) + return &mm->context.mask_16m; + if (psize == MMU_PAGE_16G) + return &mm->context.mask_16g; +#endif + BUG(); +} +#elif defined(CONFIG_PPC_8xx) +static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +{ + if (psize == mmu_virtual_psize) + return &mm->context.mask_base_psize; +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_512K) + return &mm->context.mask_512k; + if (psize == MMU_PAGE_8M) + return &mm->context.mask_8m; +#endif + BUG(); +} +#else +#error "Must define the slice masks for page sizes supported by the platform" +#endif - lpsizes = mm->context.low_slices_psize; - for (i = 0; i < SLICE_NUM_LOW; i++) - if (((lpsizes >> (i * 4)) & 0xf) == psize) - ret->low_slices |= 1u << i; +static bool slice_check_range_fits(struct mm_struct *mm, + const struct slice_mask *available, + unsigned long start, unsigned long len) +{ + unsigned long end = start + len - 1; + u64 low_slices = 0; - if (high_limit <= SLICE_LOW_TOP) - return; + if (slice_addr_is_low(start)) { + unsigned long mend = min(end, + (unsigned long)(SLICE_LOW_TOP - 1)); - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) { - mask_index = i & 0x1; - index = i >> 1; - if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) - __set_bit(i, ret->high_slices); + low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); } -} + if ((low_slices & available->low_slices) != low_slices) + return false; -static int slice_check_fit(struct mm_struct *mm, - struct slice_mask mask, struct slice_mask available) -{ - DECLARE_BITMAP(result, SLICE_NUM_HIGH); - /* - * Make sure we just do bit compare only to the max - * addr limit and not the full bit map size. - */ - unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; + unsigned long i; - bitmap_and(result, mask.high_slices, - available.high_slices, slice_count); + for (i = start_index; i < start_index + count; i++) { + if (!test_bit(i, available->high_slices)) + return false; + } + } - return (mask.low_slices & available.low_slices) == mask.low_slices && - bitmap_equal(result, mask.high_slices, slice_count); + return true; } static void slice_flush_segments(void *parm) { +#ifdef CONFIG_PPC64 struct mm_struct *mm = parm; unsigned long flags; @@ -198,42 +227,66 @@ static void slice_flush_segments(void *parm) copy_mm_to_paca(current->active_mm); local_irq_save(flags); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); local_irq_restore(flags); +#endif } -static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize) +static void slice_convert(struct mm_struct *mm, + const struct slice_mask *mask, int psize) { int index, mask_index; /* Write the new slice psize bits */ - unsigned char *hpsizes; - u64 lpsizes; + unsigned char *hpsizes, *lpsizes; + struct slice_mask *psize_mask, *old_mask; unsigned long i, flags; + int old_psize; slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); slice_print_mask(" mask", mask); + psize_mask = slice_mask_for_size(mm, psize); + /* We need to use a spinlock here to protect against * concurrent 64k -> 4k demotion ... */ spin_lock_irqsave(&slice_convert_lock, flags); lpsizes = mm->context.low_slices_psize; - for (i = 0; i < SLICE_NUM_LOW; i++) - if (mask.low_slices & (1u << i)) - lpsizes = (lpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); + for (i = 0; i < SLICE_NUM_LOW; i++) { + if (!(mask->low_slices & (1u << i))) + continue; + + mask_index = i & 0x1; + index = i >> 1; - /* Assign the value back */ - mm->context.low_slices_psize = lpsizes; + /* Update the slice_mask */ + old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(mm, old_psize); + old_mask->low_slices &= ~(1u << i); + psize_mask->low_slices |= 1u << i; + + /* Update the sizes array */ + lpsizes[index] = (lpsizes[index] & ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } hpsizes = mm->context.high_slices_psize; for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { + if (!test_bit(i, mask->high_slices)) + continue; + mask_index = i & 0x1; index = i >> 1; - if (test_bit(i, mask.high_slices)) - hpsizes[index] = (hpsizes[index] & - ~(0xf << (mask_index * 4))) | + + /* Update the slice_mask */ + old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; + old_mask = slice_mask_for_size(mm, old_psize); + __clear_bit(i, old_mask->high_slices); + __set_bit(i, psize_mask->high_slices); + + /* Update the sizes array */ + hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | (((unsigned long)psize) << (mask_index * 4)); } @@ -254,26 +307,25 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz * 'available' slice_mark. */ static bool slice_scan_available(unsigned long addr, - struct slice_mask available, - int end, - unsigned long *boundary_addr) + const struct slice_mask *available, + int end, unsigned long *boundary_addr) { unsigned long slice; - if (addr < SLICE_LOW_TOP) { + if (slice_addr_is_low(addr)) { slice = GET_LOW_SLICE_INDEX(addr); *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; - return !!(available.low_slices & (1u << slice)); + return !!(available->low_slices & (1u << slice)); } else { slice = GET_HIGH_SLICE_INDEX(addr); *boundary_addr = (slice + end) ? ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; - return !!test_bit(slice, available.high_slices); + return !!test_bit(slice, available->high_slices); } } static unsigned long slice_find_area_bottomup(struct mm_struct *mm, unsigned long len, - struct slice_mask available, + const struct slice_mask *available, int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); @@ -319,7 +371,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, static unsigned long slice_find_area_topdown(struct mm_struct *mm, unsigned long len, - struct slice_mask available, + const struct slice_mask *available, int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); @@ -377,7 +429,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, - struct slice_mask mask, int psize, + const struct slice_mask *mask, int psize, int topdown, unsigned long high_limit) { if (topdown) @@ -386,23 +438,33 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, return slice_find_area_bottomup(mm, len, mask, psize, high_limit); } -static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src) +static inline void slice_copy_mask(struct slice_mask *dst, + const struct slice_mask *src) { - DECLARE_BITMAP(result, SLICE_NUM_HIGH); - - dst->low_slices |= src->low_slices; - bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH); - bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH); + dst->low_slices = src->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_copy(dst->high_slices, src->high_slices, SLICE_NUM_HIGH); } -static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src) +static inline void slice_or_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) { - DECLARE_BITMAP(result, SLICE_NUM_HIGH); - - dst->low_slices &= ~src->low_slices; + dst->low_slices = src1->low_slices | src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_or(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); +} - bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH); - bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH); +static inline void slice_andnot_mask(struct slice_mask *dst, + const struct slice_mask *src1, + const struct slice_mask *src2) +{ + dst->low_slices = src1->low_slices & ~src2->low_slices; + if (!SLICE_NUM_HIGH) + return; + bitmap_andnot(dst->high_slices, src1->high_slices, src2->high_slices, SLICE_NUM_HIGH); } #ifdef CONFIG_PPC_64K_PAGES @@ -415,10 +477,10 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long flags, unsigned int psize, int topdown) { - struct slice_mask mask; struct slice_mask good_mask; struct slice_mask potential_mask; - struct slice_mask compat_mask; + const struct slice_mask *maskp; + const struct slice_mask *compat_maskp = NULL; int fixed = (flags & MAP_FIXED); int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long page_size = 1UL << pshift; @@ -442,23 +504,16 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, } if (high_limit > mm->context.slb_addr_limit) { + /* + * Increasing the slb_addr_limit does not require + * slice mask cache to be recalculated because it should + * be already initialised beyond the old address limit. + */ mm->context.slb_addr_limit = high_limit; + on_each_cpu(slice_flush_segments, mm, 1); } - /* - * init different masks - */ - mask.low_slices = 0; - bitmap_zero(mask.high_slices, SLICE_NUM_HIGH); - - /* silence stupid warning */; - potential_mask.low_slices = 0; - bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH); - - compat_mask.low_slices = 0; - bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH); - /* Sanity checks */ BUG_ON(mm->task_size == 0); BUG_ON(mm->context.slb_addr_limit == 0); @@ -481,8 +536,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - slice_mask_for_size(mm, psize, &good_mask, high_limit); - slice_print_mask(" good_mask", good_mask); + maskp = slice_mask_for_size(mm, psize); /* * Here "good" means slices that are already the right page size, @@ -503,40 +557,47 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * search in good | compat | free, found => convert free. */ -#ifdef CONFIG_PPC_64K_PAGES - /* If we support combo pages, we can allow 64k pages in 4k slices */ - if (psize == MMU_PAGE_64K) { - slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit); + /* + * If we support combo pages, we can allow 64k pages in 4k slices + * The mask copies could be avoided in most cases here if we had + * a pointer to good mask for the next code to use. + */ + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { + compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); if (fixed) - slice_or_mask(&good_mask, &compat_mask); + slice_or_mask(&good_mask, maskp, compat_maskp); + else + slice_copy_mask(&good_mask, maskp); + } else { + slice_copy_mask(&good_mask, maskp); } -#endif + + slice_print_mask(" good_mask", &good_mask); + if (compat_maskp) + slice_print_mask(" compat_mask", compat_maskp); /* First check hint if it's valid or if we have MAP_FIXED */ if (addr != 0 || fixed) { - /* Build a mask for the requested range */ - slice_range_to_mask(addr, len, &mask); - slice_print_mask(" mask", mask); - /* Check if we fit in the good mask. If we do, we just return, * nothing else to do */ - if (slice_check_fit(mm, mask, good_mask)) { + if (slice_check_range_fits(mm, &good_mask, addr, len)) { slice_dbg(" fits good !\n"); - return addr; + newaddr = addr; + goto return_addr; } } else { /* Now let's see if we can find something in the existing * slices for that size */ - newaddr = slice_find_area(mm, len, good_mask, + newaddr = slice_find_area(mm, len, &good_mask, psize, topdown, high_limit); if (newaddr != -ENOMEM) { /* Found within the good mask, we don't have to setup, * we thus return directly */ slice_dbg(" found area at 0x%lx\n", newaddr); - return newaddr; + goto return_addr; } } /* @@ -544,12 +605,15 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * empty and thus can be converted */ slice_mask_for_free(mm, &potential_mask, high_limit); - slice_or_mask(&potential_mask, &good_mask); - slice_print_mask(" potential", potential_mask); + slice_or_mask(&potential_mask, &potential_mask, &good_mask); + slice_print_mask(" potential", &potential_mask); - if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) { - slice_dbg(" fits potential !\n"); - goto convert; + if (addr != 0 || fixed) { + if (slice_check_range_fits(mm, &potential_mask, addr, len)) { + slice_dbg(" fits potential !\n"); + newaddr = addr; + goto convert; + } } /* If we have MAP_FIXED and failed the above steps, then error out */ @@ -562,46 +626,64 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * anywhere in the good area. */ if (addr) { - addr = slice_find_area(mm, len, good_mask, - psize, topdown, high_limit); - if (addr != -ENOMEM) { - slice_dbg(" found area at 0x%lx\n", addr); - return addr; + newaddr = slice_find_area(mm, len, &good_mask, + psize, topdown, high_limit); + if (newaddr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", newaddr); + goto return_addr; } } /* Now let's see if we can find something in the existing slices * for that size plus free slices */ - addr = slice_find_area(mm, len, potential_mask, - psize, topdown, high_limit); + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); #ifdef CONFIG_PPC_64K_PAGES - if (addr == -ENOMEM && psize == MMU_PAGE_64K) { + if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ - slice_or_mask(&potential_mask, &compat_mask); - addr = slice_find_area(mm, len, potential_mask, - psize, topdown, high_limit); + slice_or_mask(&potential_mask, &potential_mask, compat_maskp); + newaddr = slice_find_area(mm, len, &potential_mask, + psize, topdown, high_limit); } #endif - if (addr == -ENOMEM) + if (newaddr == -ENOMEM) return -ENOMEM; - slice_range_to_mask(addr, len, &mask); - slice_dbg(" found potential area at 0x%lx\n", addr); - slice_print_mask(" mask", mask); + slice_range_to_mask(newaddr, len, &potential_mask); + slice_dbg(" found potential area at 0x%lx\n", newaddr); + slice_print_mask(" mask", &potential_mask); convert: - slice_andnot_mask(&mask, &good_mask); - slice_andnot_mask(&mask, &compat_mask); - if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) { - slice_convert(mm, mask, psize); + /* + * Try to allocate the context before we do slice convert + * so that we handle the context allocation failure gracefully. + */ + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + + slice_andnot_mask(&potential_mask, &potential_mask, &good_mask); + if (compat_maskp && !fixed) + slice_andnot_mask(&potential_mask, &potential_mask, compat_maskp); + if (potential_mask.low_slices || + (SLICE_NUM_HIGH && + !bitmap_empty(potential_mask.high_slices, SLICE_NUM_HIGH))) { + slice_convert(mm, &potential_mask, psize); if (psize > MMU_PAGE_BASE) on_each_cpu(slice_flush_segments, mm, 1); } - return addr; + return newaddr; +return_addr: + if (need_extra_context(mm, newaddr)) { + if (alloc_extended_context(mm, newaddr) < 0) + return -ENOMEM; + } + return newaddr; } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); @@ -627,95 +709,75 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) { - unsigned char *hpsizes; + unsigned char *psizes; int index, mask_index; - /* - * Radix doesn't use slice, but can get enabled along with MMU_SLICE - */ - if (radix_enabled()) { -#ifdef CONFIG_PPC_64K_PAGES - return MMU_PAGE_64K; -#else - return MMU_PAGE_4K; -#endif - } - if (addr < SLICE_LOW_TOP) { - u64 lpsizes; - lpsizes = mm->context.low_slices_psize; + VM_BUG_ON(radix_enabled()); + + if (slice_addr_is_low(addr)) { + psizes = mm->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); - return (lpsizes >> (index * 4)) & 0xf; + } else { + psizes = mm->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); } - hpsizes = mm->context.high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); mask_index = index & 0x1; - return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xf; } EXPORT_SYMBOL_GPL(get_slice_psize); -/* - * This is called by hash_page when it needs to do a lazy conversion of - * an address space from real 64K pages to combo 4K pages (typically - * when hitting a non cacheable mapping on a processor or hypervisor - * that won't allow them for 64K pages). - * - * This is also called in init_new_context() to change back the user - * psize from whatever the parent context had it set to - * N.B. This may be called before mm->context.id has been set. - * - * This function will only change the content of the {low,high)_slice_psize - * masks, it will not flush SLBs as this shall be handled lazily by the - * caller. - */ -void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) +void slice_init_new_context_exec(struct mm_struct *mm) { - int index, mask_index; - unsigned char *hpsizes; - unsigned long flags, lpsizes; - unsigned int old_psize; - int i; + unsigned char *hpsizes, *lpsizes; + struct slice_mask *mask; + unsigned int psize = mmu_virtual_psize; - slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + slice_dbg("slice_init_new_context_exec(mm=%p)\n", mm); - VM_BUG_ON(radix_enabled()); - spin_lock_irqsave(&slice_convert_lock, flags); - - old_psize = mm->context.user_psize; - slice_dbg(" old_psize=%d\n", old_psize); - if (old_psize == psize) - goto bail; + /* + * In the case of exec, use the default limit. In the + * case of fork it is just inherited from the mm being + * duplicated. + */ +#ifdef CONFIG_PPC64 + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; +#else + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif mm->context.user_psize = psize; - wmb(); + /* + * Set all slice psizes to the default. + */ lpsizes = mm->context.low_slices_psize; - for (i = 0; i < SLICE_NUM_LOW; i++) - if (((lpsizes >> (i * 4)) & 0xf) == old_psize) - lpsizes = (lpsizes & ~(0xful << (i * 4))) | - (((unsigned long)psize) << (i * 4)); - /* Assign the value back */ - mm->context.low_slices_psize = lpsizes; + memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { - mask_index = i & 0x1; - index = i >> 1; - if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize) - hpsizes[index] = (hpsizes[index] & - ~(0xf << (mask_index * 4))) | - (((unsigned long)psize) << (mask_index * 4)); - } + memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); + /* + * Slice mask cache starts zeroed, fill the default size cache. + */ + mask = slice_mask_for_size(mm, psize); + mask->low_slices = ~0UL; + if (SLICE_NUM_HIGH) + bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); +} +#ifdef CONFIG_PPC_BOOK3S_64 +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); - slice_dbg(" lsps=%lx, hsps=%lx\n", - (unsigned long)mm->context.low_slices_psize, - (unsigned long)mm->context.high_slices_psize); + if (!is_32bit_task()) + return; - bail: - spin_unlock_irqrestore(&slice_convert_lock, flags); + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; } +#endif void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) @@ -725,7 +787,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, VM_BUG_ON(radix_enabled()); slice_range_to_mask(start, len, &mask); - slice_convert(mm, mask, psize); + slice_convert(mm, &mask, psize); } #ifdef CONFIG_HUGETLB_PAGE @@ -748,33 +810,27 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, * for now as we only use slices with hugetlbfs enabled. This should * be fixed as the generic code gets fixed. */ -int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, +int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { - struct slice_mask mask, available; + const struct slice_mask *maskp; unsigned int psize = mm->context.user_psize; - unsigned long high_limit = mm->context.slb_addr_limit; - if (radix_enabled()) - return 0; + VM_BUG_ON(radix_enabled()); - slice_range_to_mask(addr, len, &mask); - slice_mask_for_size(mm, psize, &available, high_limit); + maskp = slice_mask_for_size(mm, psize); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { - struct slice_mask compat_mask; - slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit); - slice_or_mask(&available, &compat_mask); + const struct slice_mask *compat_maskp; + struct slice_mask available; + + compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + slice_or_mask(&available, maskp, compat_maskp); + return !slice_check_range_fits(mm, &available, addr, len); } #endif -#if 0 /* too verbose */ - slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n", - mm, addr, len); - slice_print_mask(" mask", mask); - slice_print_mask(" available", available); -#endif - return !slice_check_fit(mm, mask, available); + return !slice_check_range_fits(mm, maskp, addr, len); } #endif diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index f14a07c2fb90..3327551c8b47 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -13,10 +13,10 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <linux/uaccess.h> -#include <asm/tlbflush.h> /* * Free all pages allocated for subpage protection maps and pointers. @@ -185,7 +185,8 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, * in a 2-bit field won't allow writes to a page that is otherwise * write-protected. */ -long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) +SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, + unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; struct subpage_prot_table *spt = &mm->context.spt; diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index a07f5372a4bf..6a23b9ebd2a1 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -12,6 +12,8 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/memblock.h> +#include <linux/mmu_context.h> +#include <linux/sched/mm.h> #include <asm/ppc-opcode.h> #include <asm/tlb.h> @@ -33,13 +35,12 @@ static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, { unsigned long rb; unsigned long rs; - unsigned int r = 1; /* radix format */ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory"); } @@ -98,7 +99,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, rb |= set << PPC_BITLSHIFT(51); rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ - r = 1; /* raidx format */ + r = 1; /* radix format */ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); @@ -112,13 +113,60 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric) rb = PPC_BIT(53); /* IS = 1 */ rs = pid << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ - r = 1; /* raidx format */ + r = 1; /* radix format */ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void __tlbiel_lpid(unsigned long lpid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rb |= set << PPC_BITLSHIFT(51); + rs = 0; /* LPID comes from LPIDR */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rb |= set << PPC_BITLSHIFT(51); + rs = 0; /* LPID comes from LPIDR */ + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 1, rb, rs, ric, prs, r); +} + + static inline void __tlbiel_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { @@ -128,7 +176,7 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, rb |= ap << PPC_BITLSHIFT(58); rs = pid << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ - r = 1; /* raidx format */ + r = 1; /* radix format */ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); @@ -144,13 +192,29 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, rb |= ap << PPC_BITLSHIFT(58); rs = pid << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ - r = 1; /* raidx format */ + r = 1; /* radix format */ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + static inline void fixup_tlbie(void) { unsigned long pid = 0; @@ -162,6 +226,16 @@ static inline void fixup_tlbie(void) } } +static inline void fixup_tlbie_lpid(unsigned long lpid) +{ + unsigned long va = ((1UL << 52) - 1); + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); + } +} + /* * We use 128 set in radix mode and 256 set in hpt mode. */ @@ -215,6 +289,87 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) +{ + int set; + + VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, + * also flush the entire Page Walk Cache. + */ + __tlbiel_lpid(lpid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + + /* + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. + */ + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid(lpid, RIC_FLUSH_ALL); + } + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) +{ + int set; + + VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, + * also flush the entire Page Walk Cache. + */ + __tlbiel_lpid_guest(lpid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); +} + + static inline void __tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) @@ -269,6 +424,17 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, ric); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + static inline void _tlbie_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize, bool also_pwc) @@ -341,6 +507,15 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd } EXPORT_SYMBOL(radix__local_flush_tlb_page); +static bool mm_is_singlethreaded(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.copros) > 0) + return false; + if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) + return true; + return false; +} + static bool mm_needs_flush_escalation(struct mm_struct *mm) { /* @@ -348,10 +523,47 @@ static bool mm_needs_flush_escalation(struct mm_struct *mm) * caching PTEs and not flushing them properly when * RIC = 0 for a PID/LPID invalidate */ - return atomic_read(&mm->context.copros) != 0; + if (atomic_read(&mm->context.copros) > 0) + return true; + return false; } #ifdef CONFIG_SMP +static void do_exit_flush_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + unsigned long pid = mm->context.id; + + if (current->mm == mm) + return; /* Local CPU */ + + if (current->active_mm == mm) { + /* + * Must be a kernel thread because sender is single-threaded. + */ + BUG_ON(current->mm); + mmgrab(&init_mm); + switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; + mmdrop(mm); + } + _tlbiel_pid(pid, RIC_FLUSH_ALL); +} + +static void exit_flush_lazy_tlbs(struct mm_struct *mm) +{ + /* + * Would be nice if this was async so it could be run in + * parallel with our local flush, but generic code does not + * give a good API for it. Could extend the generic code or + * make a special powerpc IPI for flushing TLBs. + * For now it's not too performance critical. + */ + smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, + (void *)mm, 1); + mm_reset_thread_local(mm); +} + void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; @@ -361,18 +573,30 @@ void radix__flush_tlb_mm(struct mm_struct *mm) return; preempt_disable(); + /* + * Order loads of mm_cpumask vs previous stores to clear ptes before + * the invalidate. See barrier in switch_mm_irqs_off + */ + smp_mb(); if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + if (mm_needs_flush_escalation(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbie_pid(pid, RIC_FLUSH_TLB); - } else + } else { +local: _tlbiel_pid(pid, RIC_FLUSH_TLB); + } preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -void radix__flush_all_mm(struct mm_struct *mm) +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) { unsigned long pid; @@ -381,12 +605,25 @@ void radix__flush_all_mm(struct mm_struct *mm) return; preempt_disable(); - if (!mm_is_thread_local(mm)) + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (!fullmm) { + exit_flush_lazy_tlbs(mm); + goto local; + } + } _tlbie_pid(pid, RIC_FLUSH_ALL); - else + } else { +local: _tlbiel_pid(pid, RIC_FLUSH_ALL); + } preempt_enable(); } +void radix__flush_all_mm(struct mm_struct *mm) +{ + __flush_all_mm(mm, false); +} EXPORT_SYMBOL(radix__flush_all_mm); void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) @@ -405,10 +642,17 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, return; preempt_disable(); - if (!mm_is_thread_local(mm)) + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - else + } else { +local: _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } preempt_enable(); } @@ -446,35 +690,38 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; -void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) +static inline void __radix__flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool flush_all_sizes) { - struct mm_struct *mm = vma->vm_mm; unsigned long pid; unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; bool local, full; -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_tlb_range(vma, start, end); -#endif - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) return; preempt_disable(); - if (mm_is_thread_local(mm)) { - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } else { + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } local = false; full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); } if (full) { @@ -487,37 +734,64 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, _tlbie_pid(pid, RIC_FLUSH_TLB); } } else { - bool hflush = false; + bool hflush = flush_all_sizes; + bool gflush = flush_all_sizes; unsigned long hstart, hend; + unsigned long gstart, gend; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT; - hend = end >> HPAGE_PMD_SHIFT; - if (hstart < hend) { - hstart <<= HPAGE_PMD_SHIFT; - hend <<= HPAGE_PMD_SHIFT; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) hflush = true; + + if (hflush) { + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + if (hstart == hend) + hflush = false; + } + + if (gflush) { + gstart = (start + PUD_SIZE - 1) & PUD_MASK; + gend = end & PUD_MASK; + if (gstart == gend) + gflush = false; } -#endif asm volatile("ptesync": : :"memory"); if (local) { __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, - HPAGE_PMD_SIZE, MMU_PAGE_2M); + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbiel_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); } else { __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, - HPAGE_PMD_SIZE, MMU_PAGE_2M); + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbie_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } } preempt_enable(); } + +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + __radix__flush_tlb_range(vma->vm_mm, start, end, false); +} EXPORT_SYMBOL(radix__flush_tlb_range); static int radix_get_mmu_psize(int page_size) @@ -535,6 +809,58 @@ static int radix_get_mmu_psize(int page_size) return psize; } +/* + * Flush partition scoped LPID address translation for all CPUs. + */ +void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + int psize = radix_get_mmu_psize(page_size); + + _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); + +/* + * Flush partition scoped PWC from LPID for all CPUs. + */ +void radix__flush_pwc_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_PWC); +} +EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ +void radix__flush_tlb_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ +void radix__local_flush_tlb_lpid(unsigned int lpid) +{ + _tlbiel_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); + +/* + * Flush process scoped translations from LPID (=LPIDR). + * Important difference, the guest normally manages its own translations, + * but some cases e.g., vCPU CPU migration require KVM to flush. + */ +void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +{ + _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); + + static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); @@ -543,6 +869,8 @@ void radix__tlb_flush(struct mmu_gather *tlb) int psize = 0; struct mm_struct *mm = tlb->mm; int page_size = tlb->page_size; + unsigned long start = tlb->start; + unsigned long end = tlb->end; /* * if page size is not something we understand, do a full mm flush @@ -552,16 +880,46 @@ void radix__tlb_flush(struct mmu_gather *tlb) * See the comment for radix in arch_exit_mmap(). */ if (tlb->fullmm) { - radix__flush_all_mm(mm); + __flush_all_mm(mm, true); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) + } else if (mm_tlb_flush_nested(mm)) { + /* + * If there is a concurrent invalidation that is clearing ptes, + * then it's possible this invalidation will miss one of those + * cleared ptes and miss flushing the TLB. If this invalidate + * returns before the other one flushes TLBs, that can result + * in it returning while there are still valid TLBs inside the + * range to be invalidated. + * + * See mm/memory.c:tlb_finish_mmu() for more details. + * + * The solution to this is ensure the entire range is always + * flushed here. The problem for powerpc is that the flushes + * are page size specific, so this "forced flush" would not + * do the right thing if there are a mix of page sizes in + * the range to be invalidated. So use __flush_tlb_range + * which invalidates all possible page sizes in the range. + * + * PWC flush probably is not be required because the core code + * shouldn't free page tables in this path, but accounting + * for the possibility makes us a bit more robust. + * + * need_flush_all is an uncommon case because page table + * teardown should be done with exclusive locks held (but + * after locks are dropped another invalidate could come + * in), it could be optimized further if necessary. + */ + if (!tlb->need_flush_all) + __radix__flush_tlb_range(mm, start, end, true); + else + radix__flush_all_mm(mm); +#endif } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->need_flush_all) radix__flush_tlb_mm(mm); else radix__flush_all_mm(mm); } else { - unsigned long start = tlb->start; - unsigned long end = tlb->end; - if (!tlb->need_flush_all) radix__flush_tlb_range_psize(mm, start, end, psize); else @@ -585,24 +943,33 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, return; preempt_disable(); - if (mm_is_thread_local(mm)) { - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } else { + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } local = false; full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); } if (full) { - if (!local && mm_needs_flush_escalation(mm)) - also_pwc = true; - - if (local) + if (local) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - else - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } } else { if (local) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); @@ -643,11 +1010,16 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) /* Otherwise first do the PWC, then iterate the pages. */ preempt_disable(); - - if (mm_is_thread_local(mm)) { - _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } else { + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + } else { +local: + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } preempt_enable(); @@ -668,7 +1040,7 @@ void radix__flush_tlb_all(void) rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */ prs = 0; /* partition scoped */ - r = 1; /* raidx format */ + r = 1; /* radix format */ rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */ asm volatile("ptesync": : :"memory"); @@ -685,28 +1057,10 @@ void radix__flush_tlb_all(void) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, - unsigned long address) -{ - /* - * We track page size in pte only for DD1, So we can - * call this only on DD1. - */ - if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) { - VM_WARN_ON(1); - return; - } - - if (old_pte & R_PAGE_LARGE) - radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M); - else - radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize); -} - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) { - unsigned int pid = mm->context.id; + unsigned long pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) return; @@ -734,7 +1088,9 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { if (sib == cpu) continue; - if (paca[sib].kvm_hstate.kvm_vcpu) + if (!cpu_possible(sib)) + continue; + if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) flush = true; } if (flush) diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 702d7689d714..cf8472cf3d59 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -41,7 +41,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) { unsigned long ptephys; - if (Hash != 0) { + if (Hash) { ptephys = __pa(ptep) & PAGE_MASK; flush_hash_pages(mm->context.id, addr, ptephys, 1); } @@ -54,7 +54,7 @@ EXPORT_SYMBOL(flush_hash_entry); */ void tlb_flush(struct mmu_gather *tlb) { - if (Hash == 0) { + if (!Hash) { /* * 603 needs to flush the whole TLB here since * it doesn't use a hash table. @@ -84,7 +84,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start, int count; unsigned int ctx = mm->context.id; - if (Hash == 0) { + if (!Hash) { _tlbia(); return; } @@ -124,7 +124,7 @@ void flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; - if (Hash == 0) { + if (!Hash) { _tlbia(); return; } @@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) struct mm_struct *mm; pmd_t *pmd; - if (Hash == 0) { + if (!Hash) { _tlbie(vmaddr); return; } diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 9b23f12e863c..87d71dd25441 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -89,7 +89,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); + vsid = get_user_vsid(&mm->context, addr, ssize); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); ssize = mmu_kernel_ssize; diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index eb82d787d99a..7fd20c52a8ec 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -22,6 +22,7 @@ #include <asm/ppc-opcode.h> #include <asm/kvm_asm.h> #include <asm/kvm_booke_hv_asm.h> +#include <asm/feature-fixups.h> #ifdef CONFIG_PPC_64K_PAGES #define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 15fe5f0c8665..ae5d568e267f 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -503,6 +503,9 @@ static void setup_page_sizes(void) for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { struct mmu_psize_def *def = &mmu_psize_defs[psize]; + if (!def->shift) + continue; + if (tlb1ps & (1U << (def->shift - 10))) { def->flags |= MMU_PAGE_SIZE_DIRECT; diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 048b8e9f4492..e066a658acac 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -34,6 +34,8 @@ #include <asm/asm-offsets.h> #include <asm/processor.h> #include <asm/bug.h> +#include <asm/asm-compat.h> +#include <asm/feature-fixups.h> #if defined(CONFIG_40x) |

