Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/Makefile | 47
-rw-r--r-- | arch/powerpc/mm/book3s32/Makefile | 9
-rw-r--r-- | arch/powerpc/mm/book3s32/hash_low.S (renamed from arch/powerpc/mm/hash_low_32.S) | 16
-rw-r--r-- | arch/powerpc/mm/book3s32/mmu.c (renamed from arch/powerpc/mm/ppc_mmu_32.c) | 166
-rw-r--r-- | arch/powerpc/mm/book3s32/mmu_context.c (renamed from arch/powerpc/mm/mmu_context_hash32.c) | 7
-rw-r--r-- | arch/powerpc/mm/book3s32/tlb.c (renamed from arch/powerpc/mm/tlb_hash32.c) | 9
-rw-r--r-- | arch/powerpc/mm/book3s64/Makefile | 23
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_4k.c (renamed from arch/powerpc/mm/hash64_4k.c) | 2
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_64k.c (renamed from arch/powerpc/mm/hash64_64k.c) | 2
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_hugepage.c (renamed from arch/powerpc/mm/hugepage-hash64.c) | 2
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_hugetlbpage.c (renamed from arch/powerpc/mm/hugetlbpage-hash64.c) | 31
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_native.c (renamed from arch/powerpc/mm/hash_native_64.c) | 81
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_pgtable.c (renamed from arch/powerpc/mm/pgtable-hash64.c) | 23
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_tlb.c (renamed from arch/powerpc/mm/tlb_hash64.c) | 24
-rw-r--r-- | arch/powerpc/mm/book3s64/hash_utils.c (renamed from arch/powerpc/mm/hash_utils_64.c) | 267
-rw-r--r-- | arch/powerpc/mm/book3s64/iommu_api.c (renamed from arch/powerpc/mm/mmu_context_iommu.c) | 149
-rw-r--r-- | arch/powerpc/mm/book3s64/mmu_context.c (renamed from arch/powerpc/mm/mmu_context_book3s64.c) | 97
-rw-r--r-- | arch/powerpc/mm/book3s64/pgtable.c (renamed from arch/powerpc/mm/pgtable-book3s64.c) | 125
-rw-r--r-- | arch/powerpc/mm/book3s64/pkeys.c (renamed from arch/powerpc/mm/pkeys.c) | 11
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_hugetlbpage.c (renamed from arch/powerpc/mm/hugetlbpage-radix.c) | 0
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_pgtable.c (renamed from arch/powerpc/mm/pgtable-radix.c) | 325
-rw-r--r-- | arch/powerpc/mm/book3s64/radix_tlb.c (renamed from arch/powerpc/mm/tlb-radix.c) | 508
-rw-r--r-- | arch/powerpc/mm/book3s64/slb.c (renamed from arch/powerpc/mm/slb.c) | 38
-rw-r--r-- | arch/powerpc/mm/book3s64/subpage_prot.c (renamed from arch/powerpc/mm/subpage-prot.c) | 57
-rw-r--r-- | arch/powerpc/mm/copro_fault.c | 33
-rw-r--r-- | arch/powerpc/mm/dma-noncoherent.c | 325
-rw-r--r-- | arch/powerpc/mm/drmem.c | 12
-rw-r--r-- | arch/powerpc/mm/fault.c | 89
-rw-r--r-- | arch/powerpc/mm/highmem.c | 14
-rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 299
-rw-r--r-- | arch/powerpc/mm/init-common.c | 40
-rw-r--r-- | arch/powerpc/mm/init_32.c | 20
-rw-r--r-- | arch/powerpc/mm/init_64.c | 88
-rw-r--r-- | arch/powerpc/mm/ioremap.c | 99
-rw-r--r-- | arch/powerpc/mm/ioremap_32.c | 93
-rw-r--r-- | arch/powerpc/mm/ioremap_64.c | 115
-rw-r--r-- | arch/powerpc/mm/kasan/Makefile | 5
-rw-r--r-- | arch/powerpc/mm/kasan/kasan_init_32.c | 221
-rw-r--r-- | arch/powerpc/mm/mem.c | 316
-rw-r--r-- | arch/powerpc/mm/mmap.c | 16
-rw-r--r-- | arch/powerpc/mm/mmu_context.c | 9
-rw-r--r-- | arch/powerpc/mm/mmu_decl.h | 34
-rw-r--r-- | arch/powerpc/mm/nohash/40x.c (renamed from arch/powerpc/mm/40x_mmu.c) | 9
-rw-r--r-- | arch/powerpc/mm/nohash/44x.c (renamed from arch/powerpc/mm/44x_mmu.c) | 9
-rw-r--r-- | arch/powerpc/mm/nohash/8xx.c (renamed from arch/powerpc/mm/8xx_mmu.c) | 85
-rw-r--r-- | arch/powerpc/mm/nohash/Makefile | 19
-rw-r--r-- | arch/powerpc/mm/nohash/book3e_hugetlbpage.c (renamed from arch/powerpc/mm/hugetlbpage-book3e.c) | 68
-rw-r--r-- | arch/powerpc/mm/nohash/book3e_pgtable.c (renamed from arch/powerpc/mm/pgtable-book3e.c) | 15
-rw-r--r-- | arch/powerpc/mm/nohash/fsl_booke.c (renamed from arch/powerpc/mm/fsl_booke_mmu.c) | 17
-rw-r--r-- | arch/powerpc/mm/nohash/kaslr_booke.c | 401
-rw-r--r-- | arch/powerpc/mm/nohash/mmu_context.c (renamed from arch/powerpc/mm/mmu_context_nohash.c) | 8
-rw-r--r-- | arch/powerpc/mm/nohash/tlb.c (renamed from arch/powerpc/mm/tlb_nohash.c) | 29
-rw-r--r-- | arch/powerpc/mm/nohash/tlb_low.S (renamed from arch/powerpc/mm/tlb_nohash_low.S) | 7
-rw-r--r-- | arch/powerpc/mm/nohash/tlb_low_64e.S (renamed from arch/powerpc/mm/tlb_low_64e.S) | 37
-rw-r--r-- | arch/powerpc/mm/numa.c | 100
-rw-r--r-- | arch/powerpc/mm/pgtable-frag.c | 6
-rw-r--r-- | arch/powerpc/mm/pgtable.c | 132
-rw-r--r-- | arch/powerpc/mm/pgtable_32.c | 194
-rw-r--r-- | arch/powerpc/mm/pgtable_64.c | 212
-rw-r--r-- | arch/powerpc/mm/ptdump/bats.c | 2
-rw-r--r-- | arch/powerpc/mm/ptdump/hashpagetable.c | 32
-rw-r--r-- | arch/powerpc/mm/ptdump/ptdump.c | 125
-rw-r--r-- | arch/powerpc/mm/slice.c | 124
-rw-r--r-- | arch/powerpc/mm/vphn.c | 71
-rw-r--r-- | arch/powerpc/mm/vphn.h | 17
65 files changed, 3195 insertions, 2371 deletions
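One change below is worth a closer look before the diff itself: in the book3s32/mmu.c hunks, block_size() gains a comment spelling out how the largest usable BAT block is derived from a region's base and size. The following is a standalone restatement of that rule, not the kernel code itself; the 8M/256M limits and the ffs/fls arithmetic are taken from the hunk below, with GCC builtins standing in for the kernel's bit helpers.

    #include <stdio.h>

    /* Sketch of the block_size() rule from book3s32/mmu.c below: a BAT block
     * is capped by the CPU limit (8M on 601, 256M on other 6xx), by the
     * natural alignment of the base address (its lowest set bit), and by the
     * largest power of two that fits in the region (highest set bit of the
     * size). Assumes top > base. */
    static unsigned int block_size(unsigned int base, unsigned int top, int is_601)
    {
        unsigned int max_size = is_601 ? (8u << 20) : (256u << 20);
        /* kernel: (ffs(base) - 1) & 31 -- alignment of the base address */
        unsigned int base_shift = base ? __builtin_ctz(base) : 31;
        /* kernel: (fls(top - base) - 1) & 31 -- largest pow2 <= region size */
        unsigned int block_shift = 31 - __builtin_clz(top - base);
        unsigned int shift = base_shift < block_shift ? base_shift : block_shift;
        unsigned int size = 1u << shift;

        return size < max_size ? size : max_size;
    }

    int main(void)
    {
        /* the example from the kernel comment: base 0x16000000 is only
         * 32M-aligned, so the first block can be at most 0x02000000 */
        printf("0x%08x\n", block_size(0x16000000, 0x26000000, 0));
        return 0;
    }

This mirrors min3(max_size, 1 << base_shift, 1 << block_shift) in the hunk; __mmu_mapin_ram() then walks the region one such block at a time.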
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3c1bd9fa23cd..5e147986400d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,53 +5,18 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ + pgtable-frag.o ioremap.o ioremap_$(BITS).o \ init-common.o mmu_context.o drmem.o -obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ - tlb_nohash_low.o -obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o \ - $(hash64-y) mmu_context_book3s64.o \ - pgtable-book3s64.o pgtable-frag.o -obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o -obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_BOOK3S) += tlb_hash$(BITS).o -ifdef CONFIG_PPC_BOOK3S_64 -obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o -obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o -endif -obj-$(CONFIG_40x) += 40x_mmu.o -obj-$(CONFIG_44x) += 44x_mmu.o -obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o +obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ +obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-y += hugetlbpage.o -ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o -obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o -endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ -obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o - -# Disable kcov instrumentation on sensitive code -# This is necessary for booting with kcov enabled on book3e machines -KCOV_INSTRUMENT_tlb_nohash.o := n -KCOV_INSTRUMENT_fsl_booke_mmu.o := n - -# Instrumenting the SLB fault path can lead to duplicate SLB entries -KCOV_INSTRUMENT_slb.o := n +obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile new file mode 100644 index 000000000000..1732eaa740a9 --- /dev/null +++ b/arch/powerpc/mm/book3s32/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE_mmu.o := n + +ifdef CONFIG_KASAN +CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING +endif + +obj-y += mmu.o hash_low.o mmu_context.o tlb.o diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/book3s32/hash_low.S index a6c491f18a04..8bbbd9775c8a 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -11,12 +12,6 @@ * This file contains low-level assembler routines for managing * the PowerPC MMU hash table. (PPC 8xx processors don't use a * hash table, so this file is not used on them.) 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <asm/reg.h> @@ -309,13 +304,13 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -539,7 +534,8 @@ _GLOBAL(flush_hash_pages) #ifdef CONFIG_SMP lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l - lwz r8,TASK_CPU(r2) + tophys (r8, r2) + lwz r8, TASK_CPU(r8) oris r8,r8,9 10: lwarx r0,0,r9 cmpi 0,r0,0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/book3s32/mmu.c index f29d2f118b44..69b2419accef 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for handling the MMU on those * PowerPC implementations where the MMU substantially follows the @@ -14,12 +15,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/kernel.h> @@ -34,11 +29,12 @@ #include <asm/code-patching.h> #include <asm/sections.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> -struct hash_pte *Hash, *Hash_end; -unsigned long Hash_size, Hash_mask; +struct hash_pte *Hash; +static unsigned long Hash_size, Hash_mask; unsigned long _SDR1; +static unsigned int hash_mb, hash_mb2; struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ @@ -78,7 +74,7 @@ static int find_free_bat(void) { int b; - if (cpu_has_feature(CPU_FTR_601)) { + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { for (b = 0; b < 4; b++) { struct ppc_bat *bat = BATS[b]; @@ -98,10 +94,20 @@ static int find_free_bat(void) return -1; } +/* + * This function calculates the size of the larger block usable to map the + * beginning of an area based on the start address and size of that area: + * - max block size is 8M on 601 and 256 on other 6xx. + * - base address must be aligned to the block size. So the maximum block size + * is identified by the lowest bit set to 1 in the base address (for instance + * if base is 0x16000000, max size is 0x02000000). + * - block size has to be a power of two. This is calculated by finding the + * highest bit set to 1. + */ static unsigned int block_size(unsigned long base, unsigned long top) { - unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 
8 : 256) << 20; - unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M; + unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; return min3(max_size, 1U << base_shift, 1U << block_shift); @@ -157,7 +163,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { - int done; + unsigned long done; unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; if (__map_without_bats) { @@ -169,10 +175,10 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return __mmu_mapin_ram(base, top); done = __mmu_mapin_ram(base, border); - if (done != border - base) + if (done != border) return done; - return done + __mmu_mapin_ram(border, top); + return __mmu_mapin_ram(border, top); } void mmu_mark_initmem_nx(void) @@ -183,7 +189,7 @@ void mmu_mark_initmem_nx(void) unsigned long top = (unsigned long)_etext - PAGE_OFFSET; unsigned long size; - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return; for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { @@ -221,7 +227,7 @@ void mmu_mark_rodata_ro(void) int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return; for (i = 0; i < nb; i++) { @@ -245,15 +251,24 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, { unsigned int bl; int wimgxpp; - struct ppc_bat *bat = BATS[index]; + struct ppc_bat *bat; unsigned long flags = pgprot_val(prot); + if (index == -1) + index = find_free_bat(); + if (index == -1) { + pr_err("%s: no BAT available for mapping 0x%llx\n", __func__, + (unsigned long long)phys); + return; + } + bat = BATS[index]; + if ((flags & _PAGE_NO_CACHE) || (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) flags &= ~_PAGE_COHERENT; bl = (size >> 17) - 1; - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { /* 603, 604, etc. */ /* Do DBAT first */ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE @@ -291,8 +306,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* * Preload a translation in the hash table */ -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +void hash_preload(struct mm_struct *mm, unsigned long ea) { pmd_t *pmd; @@ -304,11 +318,43 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, } /* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. 
+ */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* We have to test for regs NULL since init will get here first thing at boot */ + if (!current->thread.regs) + return; + + /* We also avoid filling the hash if not coming from a fault */ + if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400) + return; + + hash_preload(vma->vm_mm, address); +} + +/* * Initialize the hash table and patch the instructions in hashtable.S. */ void __init MMU_init_hw(void) { - unsigned int hmask, mb, mb2; unsigned int n_hpteg, lg_n_hpteg; if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) @@ -345,26 +391,43 @@ void __init MMU_init_hw(void) __func__, Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; - Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); + pr_info("Total memory = %lldMB; using %ldkB for hash table\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10); + - printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", - (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); + Hash_mask = n_hpteg - 1; + hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + if (lg_n_hpteg > 16) + hash_mb2 = 16 - LG_HPTEG_SIZE; + /* + * When KASAN is selected, there is already an early temporary hash + * table and the switch to the final hash table is done later. + */ + if (IS_ENABLED(CONFIG_KASAN)) + return; + + MMU_init_hw_patch(); +} + +void __init MMU_init_hw_patch(void) +{ + unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + + if (ppc_md.progress) + ppc_md.progress("hash:patch", 0x345); + if (ppc_md.progress) + ppc_md.progress("hash:done", 0x205); + + /* WARNING: Make sure nothing can trigger a KASAN check past this point */ /* * Patch up the instructions in hashtable.S:create_hpte */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); - Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; - if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; - modify_instruction_site(&patch__hash_page_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); @@ -373,11 +436,9 @@ void __init MMU_init_hw(void) */ modify_instruction_site(&patch__flush_hash_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); } void setup_initial_memory_limit(phys_addr_t first_memblock_base, @@ -389,8 +450,35 @@ void 
setup_initial_memory_limit(phys_addr_t first_memblock_base, BUG_ON(first_memblock_base != 0); /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); else /* Anything else has 256M mapped */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); } + +void __init print_system_hash_info(void) +{ + pr_info("Hash_size = 0x%lx\n", Hash_size); + if (Hash_mask) + pr_info("Hash_mask = 0x%lx\n", Hash_mask); +} + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + if (disabled) + pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/book3s32/mmu_context.c index 921c1e33e941..218996e40a8e 100644 --- a/arch/powerpc/mm/mmu_context_hash32.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for handling the MMU on those * PowerPC implementations where the MMU substantially follows the @@ -14,12 +15,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/mm.h> diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/book3s32/tlb.c index cf8472cf3d59..2fcd321040ff 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for TLB flushing. * On machines where the MMU uses a hash table to store virtual to @@ -14,12 +15,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/kernel.h> @@ -32,7 +27,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * Called when unmapping pages to flush entries from the TLB/hash table. 
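The update_mmu_cache() helper added to book3s32/mmu.c above gates hash preloading on three tests: the PTE must be young and the address must be a user address, the thread must already own a register frame, and the trap must be a DSI (0x300) or ISI (0x400). Restated as a standalone predicate (a sketch only; the TASK_SIZE value here is illustrative, and pte_young/TRAP are reduced to plain parameters):

    #include <stdbool.h>

    #define TASK_SIZE 0xc0000000UL  /* illustrative 32-bit user/kernel split */

    /* Sketch of the preload filter in the book3s32 update_mmu_cache() above:
     * only young user PTEs, reached through a data (0x300) or instruction
     * (0x400) storage interrupt, get preloaded into the hash table. */
    static bool should_preload_hpte(unsigned long address, bool pte_young,
                                    bool have_regs, unsigned long trap)
    {
        /* we only want HPTEs for linux PTEs with _PAGE_ACCESSED set */
        if (!pte_young || address >= TASK_SIZE)
            return false;
        /* init gets here at boot before it has a register frame */
        if (!have_regs)
            return false;
        /* avoid filling the hash when not coming from a fault */
        return trap == 0x300 || trap == 0x400;
    }

The 64-bit variant later in this diff (book3s64/hash_utils.c) applies the same trap filter but additionally distinguishes 0x400 so it can pass is_exec down to hash_preload().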
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile new file mode 100644 index 000000000000..fd393b8be14f --- /dev/null +++ b/arch/powerpc/mm/book3s64/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + +obj-y += hash_pgtable.o hash_utils.o slb.o \ + mmu_context.o pgtable.o hash_tlb.o +obj-$(CONFIG_PPC_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o +obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o +obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c index 6fa6765a10eb..22e787123cdf 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU Lesser General Public License diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c index 3afa253d7f52..7084ce2951e6 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU Lesser General Public License diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugepage.c index dfbc3b32f09b..440823797de7 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c index b0d9209d9a86..eefa89c6117b 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -15,6 +15,9 @@ #include <asm/cacheflush.h> #include <asm/machdep.h> +unsigned int hpage_shift; +EXPORT_SYMBOL(hpage_shift); + extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa, unsigned long rlags, unsigned long vflags, int psize, int ssize); @@ -34,7 +37,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* Search the Linux page table for a match with va */ vpn = hpt_vpn(ea, vsid, ssize); - /* At this point, we have a pte (old_pte) which can be used to build + /* + * At this point, we have a pte (old_pte) which can be used to build * or update an HPTE. There are 2 cases: * * 1. 
There is a valid (present) pte with no associated HPTE (this is @@ -55,8 +59,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, if (unlikely(!check_pte_access(access, old_pte))) return 1; - /* Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access */ + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; @@ -74,8 +80,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, rpte = __real_pte(__pte(old_pte), ptep, offset); if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); /* Check if pte already has an hpte (case 2) */ @@ -145,3 +153,16 @@ void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr old_pte, pte); set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } + +void hugetlbpage_init_default(void) +{ + /* Set default large page size. Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift; +} diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/book3s64/hash_native.c index aaa28fd918fe..d2d8237ea9d5 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * native hashtable management. * * SMP scalability work: * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #undef DEBUG_LOW @@ -45,7 +41,7 @@ #define HPTE_LOCK_BIT (56+3) #endif -DEFINE_RAW_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) { @@ -60,7 +56,7 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) * tlbiel instruction for hash, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, unsigned int pid, unsigned int ric, unsigned int prs) { @@ -116,7 +112,7 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } void hash__tlbiel_all(unsigned int action) @@ -201,9 +197,32 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize, return va; } -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) +static inline void fixup_tlbie_vpn(unsigned long vpn, int psize, + int apsize, int ssize) { - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { /* Need the extra ptesync to ensure we don't reorder tlbie*/ asm volatile("ptesync": : :"memory"); ___tlbie(vpn, psize, apsize, ssize); @@ -287,7 +306,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize, asm volatile("ptesync": : :"memory"); } else { __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); + fixup_tlbie_vpn(vpn, psize, apsize, ssize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } if (lock_tlbie && !use_local) @@ -463,19 +482,12 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, return ret; } -static long native_hpte_find(unsigned long vpn, int psize, int ssize) +static long __native_hpte_find(unsigned long want_v, unsigned long slot) { struct hash_pte *hptep; - unsigned long hash; + unsigned long hpte_v; unsigned long i; - long slot; - unsigned long want_v, hpte_v; - - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - /* Bolted mappings are only ever in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + slot; @@ -489,6 +501,33 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) return -1; } +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + unsigned long hpte_group; + unsigned long want_v; + unsigned long hash; + long slot; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* + * We try to keep bolted entries always in primary hash + * But in some case we can find them in secondary too. 
+ */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __native_hpte_find(want_v, hpte_group); + if (slot < 0) { + /* Try in secondary */ + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __native_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; + } + + return slot; +} + /* * Update the page protection bits. Intended to be used to create * guard pages for kernel data structures on pages which are bolted @@ -860,7 +899,7 @@ static void native_flush_hash_range(unsigned long number, int local) /* * Just do one more with the last used values. */ - fixup_tlbie(vpn, psize, psize, ssize); + fixup_tlbie_vpn(vpn, psize, psize, ssize); asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index c08d49046a96..64733b9cb20a 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2005, Paul Mackerras, IBM Corporation. * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/sched.h> @@ -19,7 +15,7 @@ #include <asm/mmu.h> #include <asm/tlb.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #define CREATE_TRACE_POINTS #include <trace/events/thp.h> @@ -112,9 +108,16 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) { - int rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, start + page_size, mmu_vmemmap_psize, @@ -403,6 +406,8 @@ int hash__has_transparent_hugepage(void) return 1; } +EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 87d71dd25441..4a70d8dd39cd 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for flushing entries from the * TLB and MMU hash table. @@ -14,11 +15,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> @@ -55,7 +51,8 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, i = batch->index; - /* Get page size (maybe move back to caller). + /* + * Get page size (maybe move back to caller). 
* * NOTE: when using special 64K mappings in 4K environment like * for SPEs, we obtain the page size from the slice, which thus @@ -77,10 +74,12 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, #endif } else { psize = pte_pagesize_index(mm, addr, pte); - /* Mask the address for the standard page size. If we + /* + * Mask the address for the standard page size. If we * have a 64k page kernel, but the hardware does not * support 64k pages, this might be different from the - * hardware page size encoded in the slice table. */ + * hardware page size encoded in the slice table. + */ addr &= PAGE_MASK; offset = PTRS_PER_PTE; } @@ -161,7 +160,8 @@ void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); - /* If there's a TLB batch pending, then we must flush it because the + /* + * If there's a TLB batch pending, then we must flush it because the * pages are going to be freed and we really don't want to have a CPU * access a freed page because it has a stale TLB */ @@ -201,7 +201,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, BUG_ON(!mm->pgd); - /* Note: Normally, we should only ever use a batch within a + /* + * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work * since we don't actually modify the PTEs, we just flush the * hash while leaving the PTEs intact (including their reference @@ -238,7 +239,8 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) unsigned long flags; addr = _ALIGN_DOWN(addr, PMD_SIZE); - /* Note: Normally, we should only ever use a batch within a + /* + * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work * since we don't actually modify the PTEs, we just flush the * hash while leaving the PTEs intact (including their reference diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/book3s64/hash_utils.c index 0a4f939a8161..b30435c7d804 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC64 port by Mike Corrigan and Dave Engebretsen * {mikejc|engebret}@us.ibm.com @@ -11,11 +12,6 @@ * * Description: * PowerPC Hashed Page Table functions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #undef DEBUG @@ -37,6 +33,8 @@ #include <linux/context_tracking.h> #include <linux/libfdt.h> #include <linux/pkeys.h> +#include <linux/hugetlb.h> +#include <linux/cpu.h> #include <asm/debugfs.h> #include <asm/processor.h> @@ -64,6 +62,9 @@ #include <asm/ps3.h> #include <asm/pte-walk.h> #include <asm/asm-prototypes.h> +#include <asm/ultravisor.h> + +#include <mm/mmu_decl.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -128,7 +129,8 @@ static DEFINE_SPINLOCK(linear_map_hash_lock); struct mmu_hash_ops mmu_hash_ops; EXPORT_SYMBOL(mmu_hash_ops); -/* There are definitions of page sizes arrays to be used when none +/* + * These are definitions of page sizes arrays to be used when none * is provided by the firmware. 
*/ @@ -145,7 +147,8 @@ static struct mmu_psize_def mmu_psize_defaults[] = { }, }; -/* POWER4, GPUL, POWER5 +/* + * POWER4, GPUL, POWER5 * * Support for 16Mb large pages */ @@ -260,6 +263,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long vsid = get_kernel_vsid(vaddr, ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); unsigned long tprot = prot; + bool secondary_hash = false; /* * If we hit a bad address return error. @@ -270,10 +274,6 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - /* * If relocatable, check if it overlaps interrupt vectors that * are copied down to real 0. For relocatable kernel @@ -292,13 +292,31 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); BUG_ON(!mmu_hash_ops.hpte_insert); +repeat: ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, HPTE_V_BOLTED, psize, psize, ssize); + if (ret == -1) { + /* + * Try to to keep bolted entries in primary. + * Remove non bolted entries and try insert again + */ + ret = mmu_hash_ops.hpte_remove(hpteg); + if (ret != -1) + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + if (ret == -1 && !secondary_hash) { + secondary_hash = true; + hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP); + goto repeat; + } + } if (ret < 0) break; + cond_resched(); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled() && (paddr >> PAGE_SHIFT) < linear_map_hash_count) @@ -479,7 +497,8 @@ static int __init htab_dt_scan_page_sizes(unsigned long node, } #ifdef CONFIG_HUGETLB_PAGE -/* Scan for 16G memory blocks that have been set aside for huge pages +/* + * Scan for 16G memory blocks that have been set aside for huge pages * and reserve those blocks for 16G huge pages. */ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, @@ -496,8 +515,10 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, if (type == NULL || strcmp(type, "memory") != 0) return 0; - /* This property is the log base 2 of the number of virtual pages that - * will represent this memory block. */ + /* + * This property is the log base 2 of the number of virtual pages that + * will represent this memory block. + */ page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); if (page_count_prop == NULL) return 0; @@ -673,16 +694,15 @@ static void __init htab_init_page_sizes(void) #endif /* CONFIG_PPC_64K_PAGES */ #ifdef CONFIG_SPARSEMEM_VMEMMAP - /* We try to use 16M pages for vmemmap if that is supported + /* + * We try to use 16M pages for vmemmap if that is supported * and we have at least 1G of RAM at boot */ if (mmu_psize_defs[MMU_PAGE_16M].shift && memblock_phys_mem_size() >= 0x40000000) mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; else - mmu_vmemmap_psize = MMU_PAGE_4K; + mmu_vmemmap_psize = mmu_virtual_psize; #endif /* CONFIG_SPARSEMEM_VMEMMAP */ printk(KERN_DEBUG "Page orders: linear mapping = %d, " @@ -742,7 +762,8 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size) static unsigned long __init htab_get_table_size(void) { - /* If hash size isn't already provided by the platform, we try to + /* + * If hash size isn't already provided by the platform, we try to * retrieve it from the device-tree. 
If it's not there neither, we * calculate it now based on the total RAM size */ @@ -755,12 +776,12 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void resize_hpt_for_hotplug(unsigned long new_mem_size) +int resize_hpt_for_hotplug(unsigned long new_mem_size) { unsigned target_hpt_shift; if (!mmu_hash_ops.resize_hpt) - return; + return 0; target_hpt_shift = htab_shift_for_mem_size(new_mem_size); @@ -772,23 +793,25 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size) * reduce unless the target shift is at least 2 below the * current shift */ - if ((target_hpt_shift > ppc64_pft_size) - || (target_hpt_shift < (ppc64_pft_size - 1))) { - int rc; - - rc = mmu_hash_ops.resize_hpt(target_hpt_shift); - if (rc && (rc != -ENODEV)) - printk(KERN_WARNING - "Unable to resize hash page table to target order %d: %d\n", - target_hpt_shift, rc); - } + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; } int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) { - int rc = htab_bolt_mapping(start, end, __pa(start), - pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize); + int rc; + + if (end >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, @@ -817,7 +840,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * For now, UPRT is 0 and we have no segment table. */ htab_size = __ilog2(htab_size) - 18; - mmu_partition_table_set_entry(0, hash_table | htab_size, 0); + mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false); pr_info("Partition table %p\n", partition_tb); } @@ -851,12 +874,6 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; - /* - * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall - * to inform the hypervisor that we wish to use the HPT. 
- */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - register_process_table(0, 0, 0); #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves @@ -929,6 +946,11 @@ static void __init htab_initialize(void) DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } @@ -968,6 +990,7 @@ void __init hash__early_init_devtree(void) htab_scan_page_sizes(); } +static struct hash_mm_context init_hash_mm_context; void __init hash__early_init_mmu(void) { #ifndef CONFIG_PPC_64K_PAGES @@ -1013,11 +1036,11 @@ void __init hash__early_init_mmu(void) __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; - __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; - vmemmap = (struct page *)H_VMEMMAP_BASE; + __kernel_io_end = H_KERN_IO_END; + vmemmap = (struct page *)H_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PCI @@ -1035,12 +1058,16 @@ void __init hash__early_init_mmu(void) if (!mmu_hash_ops.hpte_insert) panic("hash__early_init_mmu: No MMU hash ops defined!\n"); - /* Initialize the MMU Hash table and create the linear mapping + /* + * Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. */ htab_initialize(); + init_mm.context.hash_context = &init_hash_mm_context; + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); + pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ slb_initialize(); @@ -1059,8 +1086,8 @@ void hash__early_init_mmu_secondary(void) if (!cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_SDR1, _SDR1); else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); } /* Initialize SLB */ slb_initialize(); @@ -1147,10 +1174,13 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 spp = 0; u32 **sbpm, *sbpp; + if (!spt) + return 0; + if (ea >= spt->maxaddr) return 0; if (ea < 0x100000000UL) { @@ -1214,7 +1244,8 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, } } -/* Result code is: +/* + * Result code is: * 0 - handled * 1 - normal page fault * -1 - critical hash insertion error @@ -1238,7 +1269,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, trace_hash_fault(ea, access, trap); /* Get region & vsid */ - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: user_region = 1; if (! 
mm) { @@ -1252,15 +1283,19 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, break; case VMALLOC_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; break; default: - /* Not a valid range - * Send the problem up to do_page_fault + /* + * Not a valid range + * Send the problem up to do_page_fault() */ rc = 1; goto bail; @@ -1285,7 +1320,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, flags |= HPTE_LOCAL_UPDATE; #ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we might + /* + * If we use 4K pages and our psize is not 4K, then we might * be hitting a special driver mapping, and need to align the * address before we fetch the PTE. * @@ -1307,7 +1343,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* Add _PAGE_PRESENT to the required access perm */ access |= _PAGE_PRESENT; - /* Pre-check access permissions (will be re-checked atomically + /* + * Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ if (!check_pte_access(access, pte_val(*ptep))) { @@ -1354,7 +1391,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, psize = MMU_PAGE_4K; } - /* If this PTE is non-cacheable and we have restrictions on + /* + * If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k */ if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { @@ -1395,7 +1433,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, flags, ssize, spp); } - /* Dump some info in case of hash insertion failure, they should + /* + * Dump some info in case of hash insertion failure, they should * never happen so it is really useful to know if/when they do */ if (rc == -1) @@ -1421,7 +1460,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, unsigned long flags = 0; struct mm_struct *mm = current->mm; - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1431,14 +1471,15 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) +int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, + unsigned long msr) { unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1455,7 +1496,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, * 2) user space access kernel space. 
*/ access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) @@ -1470,7 +1511,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) int psize = get_slice_psize(mm, ea); /* We only prefault standard pages for now */ - if (unlikely(psize != mm->context.user_psize)) + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) return false; /* @@ -1488,8 +1529,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) } #endif -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +static void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1499,7 +1540,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); - BUG_ON(REGION_ID(ea) != USER_REGION_ID); + BUG_ON(get_region_id(ea) != USER_REGION_ID); if (!should_hash_preload(mm, ea)) return; @@ -1549,7 +1590,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Hash it in */ #ifdef CONFIG_PPC_64K_PAGES - if (mm->context.user_psize == MMU_PAGE_64K) + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, update_flags, ssize); else @@ -1562,13 +1603,64 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, */ if (rc == -1) hash_failure_debug(ea, access, vsid, trap, ssize, - mm->context.user_psize, - mm->context.user_psize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), pte_val(*ptep)); out_exit: local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long trap; + bool is_exec; + + if (radix_enabled()) { + prefetch((void *)address); + return; + } + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* + * We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot. + * + * We also avoid filling the hash if not coming from a fault. + */ + + trap = current->thread.regs ? 
TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + is_exec = false; + break; + case 0x400: + is_exec = true; + break; + default: + return; + } + + hash_preload(vma->vm_mm, address, is_exec, trap); +} + #ifdef CONFIG_PPC_MEM_KEYS /* * Return the protection key associated with the given address and the @@ -1634,7 +1726,8 @@ unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, return gslot; } -/* WARNING: This is called from hash_low_64.S, if you change this prototype, +/* + * WARNING: This is called from hash_low_64.S, if you change this prototype, * do not forget to update the assembly call site ! */ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, @@ -1674,7 +1767,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, /* * IF we try to do a HUGE PTE update after a withdraw is done. * we will find the below NULL. This happens when we do - * split_huge_page_pmd + * split_huge_pmd */ if (!hpte_slot_array) return; @@ -1855,7 +1948,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - /* We don't currently support the first MEMBLOCK not mapping 0 + /* + * We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors */ BUG_ON(first_memblock_base != 0); @@ -1867,11 +1961,20 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, * * For guests on platforms before POWER9, we clamp the it limit to 1G * to avoid some funky things such as RTAS bugs etc... + * + * On POWER9 we limit to 1TB in case the host erroneously told us that + * the RMA was >1TB. Effective address bits 0:23 are treated as zero + * (meaning the access is aliased to zero i.e. addr = addr % 1TB) + * for virtual real mode addressing and so it doesn't make sense to + * have an area larger than 1TB as it can't be addressed. */ if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { ppc64_rma_size = first_memblock_size; if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + else + ppc64_rma_size = min_t(u64, ppc64_rma_size, + 1UL << SID_SHIFT_1T); /* Finally limit subsequent allocations */ memblock_set_current_limit(ppc64_rma_size); @@ -1890,10 +1993,16 @@ static int hpt_order_get(void *data, u64 *val) static int hpt_order_set(void *data, u64 val) { + int ret; + if (!mmu_hash_ops.resize_hpt) return -ENODEV; - return mmu_hash_ops.resize_hpt(val); + cpus_read_lock(); + ret = mmu_hash_ops.resize_hpt(val); + cpus_read_unlock(); + + return ret; } DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); @@ -1909,3 +2018,11 @@ static int __init hash64_debugfs(void) } machine_device_initcall(pseries, hash64_debugfs); #endif /* CONFIG_DEBUG_FS */ + +void __init print_system_hash_info(void) +{ + pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); + + if (htab_hash_mask) + pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); +} diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/book3s64/iommu_api.c index e7a9c4f6bfca..56cc84520577 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * IOMMU helpers in MMU context. * * Copyright (C) 2015 IBM Corp. 
<aik@ozlabs.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/sched/signal.h> @@ -19,6 +14,7 @@ #include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/sizes.h> +#include <linux/mm.h> #include <asm/mmu_context.h> #include <asm/pte-walk.h> #include <linux/mm_inline.h> @@ -51,40 +47,6 @@ struct mm_iommu_table_group_mem_t { u64 dev_hpa; /* Device memory base address */ }; -static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, - unsigned long npages, bool incr) -{ - long ret = 0, locked, lock_limit; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - - if (incr) { - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); - - return ret; -} - bool mm_iommu_preregistered(struct mm_struct *mm) { return !list_empty(&mm->context.iommu_group_mem_list); @@ -95,28 +57,15 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entries, unsigned long dev_hpa, struct mm_iommu_table_group_mem_t **pmem) { - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; + struct mm_iommu_table_group_mem_t *mem, *mem2; + long i, ret, locked_entries = 0, pinned = 0; unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? */ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } + unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { - ret = mm_iommu_adjust_locked_vm(mm, entries, true); + ret = account_locked_vm(mm, entries, true); if (ret) - goto unlock_exit; + return ret; locked_entries = entries; } @@ -148,17 +97,28 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + sizeof(struct vm_area_struct *); + chunk = min(chunk, entries); + for (entry = 0; entry < entries; entry += chunk) { + unsigned long n = min(entries - entry, chunk); + + ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE | FOLL_LONGTERM, + mem->hpages + entry, NULL); + if (ret == n) { + pinned += n; + continue; + } + if (ret > 0) + pinned += ret; + break; + } up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; + if (pinned != entries) { + if (!ret) + ret = -EFAULT; + goto free_exit; } pageshift = PAGE_SHIFT; @@ -169,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, * Allow to use larger than 64k IOMMU pages. Only do that * if we are backed by hugetlb. 
*/ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); mem->pageshift = min(mem->pageshift, pageshift); /* * We don't need struct page reference any more, switch @@ -183,21 +140,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } good_exit: - ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; mem->entries = entries; - *pmem = mem; - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + mutex_lock(&mem_list_mutex); -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) { + /* Overlap? */ + if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem2->ua + + (mem2->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + mutex_unlock(&mem_list_mutex); + goto free_exit; + } + } + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); mutex_unlock(&mem_list_mutex); + *pmem = mem; + + return 0; + +free_exit: + /* free the reference taken */ + for (i = 0; i < pinned; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + +unlock_exit: + account_locked_vm(mm, locked_entries, false); + return ret; } @@ -266,7 +245,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) { long ret = 0; - unsigned long entries, dev_hpa; + unsigned long unlock_entries = 0; mutex_lock(&mem_list_mutex); @@ -287,17 +266,17 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) goto unlock_exit; } + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + unlock_entries = mem->entries; + /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; mm_iommu_release(mem); - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - unlock_exit: mutex_unlock(&mem_list_mutex); + account_locked_vm(mm, unlock_entries, false); + return ret; } EXPORT_SYMBOL_GPL(mm_iommu_put); diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/book3s64/mmu_context.c index f720c5cc0b5e..0ba30b8b935b 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * MMU context allocation for 64-bit kernels. * * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/sched.h> @@ -55,13 +50,52 @@ EXPORT_SYMBOL_GPL(hash__alloc_context_id); void slb_setup_new_exec(void); +static int realloc_context_ids(mm_context_t *ctx) +{ + int i, id; + + /* + * id 0 (aka. ctx->id) is special, we always allocate a new one, even if + * there wasn't one allocated previously (which happens in the exec + * case where ctx is newly allocated). + * + * We have to be a bit careful here. We must keep the existing ids in + * the array, so that we can test if they're non-zero to decide if we + * need to allocate a new one. 
However in case of error we must free the + * ids we've allocated but *not* any of the existing ones (or risk a + * UAF). That's why we decrement i at the start of the error handling + * loop, to skip the id that we just tested but couldn't reallocate. + */ + for (i = 0; i < ARRAY_SIZE(ctx->extended_id); i++) { + if (i == 0 || ctx->extended_id[i]) { + id = hash__alloc_context_id(); + if (id < 0) + goto error; + + ctx->extended_id[i] = id; + } + } + + /* The caller expects us to return id */ + return ctx->id; + +error: + for (i--; i >= 0; i--) { + if (ctx->extended_id[i]) + ida_free(&mmu_context_ida, ctx->extended_id[i]); + } + + return id; +} + static int hash__init_new_context(struct mm_struct *mm) { int index; - index = hash__alloc_context_id(); - if (index < 0) - return index; + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), + GFP_KERNEL); + if (!mm->context.hash_context) + return -ENOMEM; /* * The old code would re-promote on fork, we don't do that when using @@ -77,10 +111,33 @@ static int hash__init_new_context(struct mm_struct *mm) * We should not be calling init_new_context() on init_mm. Hence a * check against 0 is OK. */ - if (mm->context.id == 0) + if (mm->context.id == 0) { + memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); slice_init_new_context_exec(mm); + } else { + /* This is fork. Copy hash_context details from current->mm */ + memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); +#ifdef CONFIG_PPC_SUBPAGE_PROT + /* inherit subpage prot detalis if we have one. */ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif + } - subpage_prot_init_new_context(mm); + index = realloc_context_ids(&mm->context); + if (index < 0) { +#ifdef CONFIG_PPC_SUBPAGE_PROT + kfree(mm->context.hash_context->spt); +#endif + kfree(mm->context.hash_context); + return index; + } pkey_mm_init(mm); return index; @@ -117,7 +174,7 @@ static int radix__init_new_context(struct mm_struct *mm) */ asm volatile("ptesync;isync" : : : "memory"); - mm->context.npu_context = NULL; + mm->context.hash_context = NULL; return index; } @@ -162,6 +219,7 @@ static void destroy_contexts(mm_context_t *ctx) if (context_id) ida_free(&mmu_context_ida, context_id); } + kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) @@ -198,8 +256,21 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + /* + * For tasks which were successfully initialized we end up calling + * arch_exit_mmap() which clears the process table entry. And + * arch_exit_mmap() is called before the required fullmm TLB flush + * which does a RIC=2 flush. Hence for an initialized task, we do clear + * any cached process table entries. + * + * The condition below handles the error case during task init. We have + * set the process table entry early and if we fail a task + * initialization, we need to ensure the process table entry is zeroed. + * We need not worry about process table entry caches because the task + * never ran with the PID value. 
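
To make the teardown ordering above concrete: destroy_context() runs before the final fullmm TLB flush, so for radix it only needs to zero the process table entry; the context ids and hash_context are freed afterwards. A condensed, kernel-style sketch of the resulting path (illustrative, not a verbatim copy of the patch):

void destroy_context(struct mm_struct *mm)
{
	if (radix_enabled())
		process_tb[mm->context.id].prtb0 = 0;	/* covers the error path too */
	else
		subpage_prot_free(mm);

	destroy_contexts(&mm->context);	/* ida_free() each id, kfree() hash_context */
	mm->context.id = MMU_NO_CONTEXT;
}
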
+ */ if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); + process_tb[mm->context.id].prtb0 = 0; else subpage_prot_free(mm); destroy_contexts(&mm->context); diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/book3s64/pgtable.c index a4341aba0af4..75483b40fcb1 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/sched.h> @@ -12,12 +8,15 @@ #include <linux/memblock.h> #include <misc/cxl-base.h> +#include <asm/debugfs.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/trace.h> #include <asm/powernv.h> +#include <asm/firmware.h> +#include <asm/ultravisor.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #include <trace/events/thp.h> unsigned long __pmd_frag_nr; @@ -25,9 +24,6 @@ EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; EXPORT_SYMBOL(__pmd_frag_size_shift); -int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page @@ -76,7 +72,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); + WARN_ON(!(pmd_large(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); @@ -116,6 +112,9 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. + * + * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires + * a special case check in pmd_access_permitted. */ serialize_against_pte_lookup(vma->vm_mm); return __pmd(old_pmd); @@ -206,37 +205,61 @@ void __init mmu_partition_table_init(void) * 64 K size. */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); + set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); } +static void flush_partition(unsigned int lpid, bool radix) +{ + if (radix) { + radix__flush_all_lpid(lpid); + radix__flush_all_lpid_guest(lpid); + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } +} + void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) + unsigned long dw1, bool flush) { unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + /* + * When ultravisor is enabled, the partition table is stored in secure + * memory and can only be accessed doing an ultravisor call. However, we + * maintain a copy of the partition table in normal memory to allow Nest + * MMU translations to occur (for normal VMs). + * + * Therefore, here we always update partition_tb, regardless of whether + * we are running under an ultravisor or not. 
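
For reference, the two doublewords a radix partition-table entry carries, condensed from the radix_init_partition_table() hunk later in this patch; a sketch with the field meanings as comments, set_boot_pate() being an illustrative name:

static void __init set_boot_pate(void)
{
	unsigned long dw0, dw1;

	dw0 = radix__get_tree_size() |	/* RTS: radix tree size */
	      __pa(init_mm.pgd) |	/* root of the kernel radix tree */
	      RADIX_PGD_INDEX_SIZE |	/* RPDS: size of the root level */
	      PATB_HR;			/* host radix mode */

	dw1 = __pa(process_tb) |	/* process table base */
	      (PRTB_SIZE_SHIFT - 12) |	/* process table size */
	      PATB_GR;			/* guest radix mode */

	/* flush=false: boot CPUs run tlbiel_all() before the MMU is used */
	mmu_partition_table_set_entry(0, dw0, dw1, false);
}
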
+ */ partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. + * If ultravisor is enabled, we do an ultravisor call to register the + * partition table entry (PATE), which also do a global flush of TLBs + * and partition table caches for the lpid. Otherwise, just do the + * flush. The type of flush (hash or radix) depends on what the previous + * use of the partition ID was, not the new use. */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) { + uv_register_pate(lpid, dw0, dw1); + pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", + dw0, dw1); + } else if (flush) { + /* + * Boot does not need to flush, because MMU is off and each + * CPU does a tlbiel_all() before switching them on, which + * flushes everything. + */ + flush_partition(lpid, (old & PATB_HR)); } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); @@ -447,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } + +/* + * Does the CPU support tlbie? + */ +bool tlbie_capable __read_mostly = true; +EXPORT_SYMBOL(tlbie_capable); + +/* + * Should tlbie be used for management of CPU TLBs, for kernel and process + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM + * guest address spaces. + */ +bool tlbie_enabled __read_mostly = true; + +static int __init setup_disable_tlbie(char *str) +{ + if (!radix_enabled()) { + pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n"); + return 1; + } + + tlbie_capable = false; + tlbie_enabled = false; + + return 1; +} +__setup("disable_tlbie", setup_disable_tlbie); + +static int __init pgtable_debugfs_setup(void) +{ + if (!tlbie_capable) + return 0; + + /* + * There is no locking vs tlb flushing when changing this value. + * The tlb flushers will see one value or another, and use either + * tlbie or tlbiel with IPIs. In both cases the TLBs will be + * invalidated as expected. 
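
Flush paths added later in this series choose between broadcast tlbie and tlbiel-plus-IPIs by consulting tlbie_enabled; the gate is just (a sketch; the kernel keeps its equivalent in the book3s/64 tlbflush header):

static inline bool cputlb_use_tlbie(void)
{
	return tlbie_enabled;
}

so that call sites reduce to the pattern:

	if (cputlb_use_tlbie())
		_tlbie_pid(pid, RIC_FLUSH_TLB);		/* one broadcast */
	else
		_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);	/* IPIs + local flush */
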
+ */ + debugfs_create_bool("tlbie_enabled", 0600, + powerpc_debugfs_root, + &tlbie_enabled); + + return 0; +} +arch_initcall(pgtable_debugfs_setup); diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 587807763737..59e0ebbd8036 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -7,6 +7,7 @@ #include <asm/mman.h> #include <asm/mmu_context.h> +#include <asm/mmu.h> #include <asm/setup.h> #include <linux/pkeys.h> #include <linux/of_device.h> @@ -306,16 +307,6 @@ void thread_pkey_regs_init(struct thread_struct *thread) write_iamr(pkey_iamr_mask); } -static inline bool pkey_allows_readwrite(int pkey) -{ - int pkey_shift = pkeyshift(pkey); - - if (!is_pkey_enabled(pkey)) - return true; - - return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); -} - int __execute_only_pkey(struct mm_struct *mm) { return mm->context.execute_only_pkey; diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index cab06331c0c0..cab06331c0c0 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 154472a28c77..974109bb85db 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1,21 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Page table handling routines for radix page table. * * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #define pr_fmt(fmt) "radix-mmu: " fmt +#include <linux/io.h> #include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/string_helpers.h> #include <linux/stop_machine.h> @@ -29,25 +27,14 @@ #include <asm/powernv.h> #include <asm/sections.h> #include <asm/trace.h> +#include <asm/uaccess.h> +#include <asm/ultravisor.h> #include <trace/events/thp.h> unsigned int mmu_pid_bits; unsigned int mmu_base_pid; -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) { @@ -135,6 +122,10 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); +#ifdef CONFIG_PPC_64K_PAGES + BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); +#endif + if (unlikely(!slab_is_available())) return early_map_kernel_page(ea, pa, flags, map_page_size, nid, region_start, region_end); @@ -197,14 +188,14 @@ void radix__change_memory_range(unsigned long start, unsigned long end, pudp = pud_alloc(&init_mm, pgdp, idx); if (!pudp) continue; - if (pud_huge(*pudp)) { + if (pud_is_leaf(*pudp)) { ptep = (pte_t *)pudp; goto update_the_pte; } pmdp = pmd_alloc(&init_mm, pudp, idx); if (!pmdp) continue; - if (pmd_huge(*pmdp)) { + if (pmd_is_leaf(*pmdp)) { ptep = pmdp_ptep(pmdp); goto update_the_pte; } @@ -318,7 +309,7 @@ static int __meminit create_physical_mapping(unsigned long start, return 0; } -void __init radix_init_pgtable(void) +static void __init radix_init_pgtable(void) { unsigned long rts_field; struct memblock_region *reg; @@ -334,6 +325,12 @@ void __init radix_init_pgtable(void) * page tables will be allocated within the range. No * need or a node (which we don't have yet). */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + WARN_ON(create_physical_mapping(reg->base, reg->base + reg->size, -1)); @@ -372,18 +369,6 @@ void __init radix_init_pgtable(void) */ rts_field = radix__get_tree_size(); process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - /* - * Fill in the partition table. We are suppose to use effective address - * of process table here. But our linear mapping also enable us to use - * physical address here. 
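
The "Outside the supported range" checks introduced here enforce one invariant: the linear map must end below RADIX_VMALLOC_START. A condensed sketch of the memblock walk in radix_init_pgtable() with that bound applied (illustrative only):

	for_each_memblock(memory, reg) {
		if (reg->base + reg->size >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}
		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size, -1));
	}
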
- */ - register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); - pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); /* * The init_mm context is given the first available (non-zero) PID, @@ -404,20 +389,15 @@ void __init radix_init_pgtable(void) static void __init radix_init_partition_table(void) { - unsigned long rts_field, dw0; + unsigned long rts_field, dw0, dw1; mmu_partition_table_init(); rts_field = radix__get_tree_size(); dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; - mmu_partition_table_set_entry(0, dw0, 0); + dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; + mmu_partition_table_set_entry(0, dw0, dw1, false); pr_info("Initializing Radix MMU\n"); - pr_info("Partition table %p\n", partition_tb); -} - -void __init radix_init_native(void) -{ - register_process_table = native_register_process_table; } static int __init get_idx_from_shift(unsigned int shift) @@ -508,14 +488,6 @@ void __init radix__early_init_devtree(void) mmu_psize_defs[MMU_PAGE_64K].shift = 16; mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; found: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ return; } @@ -531,8 +503,15 @@ static void radix_init_amor(void) mtspr(SPRN_AMOR, (3ul << 62)); } -static void radix_init_iamr(void) +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) { + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. 
We set bit 0 (IBM bit 1) of key0, to prevent instruction @@ -540,6 +519,25 @@ static void radix_init_iamr(void) */ mtspr(SPRN_IAMR, (1ul << 62)); } +#endif + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif void __init radix__early_init_mmu(void) { @@ -554,7 +552,13 @@ void __init radix__early_init_mmu(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP /* vmemmap mapping */ - mmu_vmemmap_psize = mmu_virtual_psize; + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } else + mmu_vmemmap_psize = mmu_virtual_psize; #endif /* * initialize page table size @@ -574,11 +578,11 @@ void __init radix__early_init_mmu(void) __pgd_val_bits = RADIX_PGD_VAL_BITS; __kernel_virt_start = RADIX_KERN_VIRT_START; - __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; - vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + __kernel_io_end = RADIX_KERN_IO_END; + vmemmap = (struct page *)RADIX_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PCI @@ -589,8 +593,9 @@ void __init radix__early_init_mmu(void) __pmd_frag_nr = RADIX_PMD_FRAG_NR; __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + radix_init_pgtable(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); @@ -601,12 +606,9 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_iamr(); - radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__early_init_mmu_secondary(void) @@ -619,15 +621,14 @@ void radix__early_init_mmu_secondary(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); } - radix_init_iamr(); radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__mmu_cleanup_all(void) @@ -637,7 +638,7 @@ void radix__mmu_cleanup_all(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); + set_ptcr_when_no_uv(0); powernv_set_nmmu_ptcr(0); radix__flush_tlb_all(); } @@ -646,7 +647,8 @@ void radix__mmu_cleanup_all(void) void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - /* We don't currently support the first MEMBLOCK not mapping 0 + /* + * We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors */ BUG_ON(first_memblock_base != 0); @@ -706,8 +708,8 @@ static int __meminit stop_machine_change_mapping(void *data) spin_unlock(&init_mm.page_table_lock); pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, 
params->aligned_end, -1); + create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1); + create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1); spin_lock(&init_mm.page_table_lock); return 0; } @@ -800,7 +802,7 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, if (!pmd_present(*pmd)) continue; - if (pmd_huge(*pmd)) { + if (pmd_is_leaf(*pmd)) { split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); continue; } @@ -825,7 +827,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr, if (!pud_present(*pud)) continue; - if (pud_huge(*pud)) { + if (pud_is_leaf(*pud)) { split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); continue; } @@ -851,7 +853,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) if (!pgd_present(*pgd)) continue; - if (pgd_huge(*pgd)) { + if (pgd_is_leaf(*pgd)) { split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); continue; } @@ -866,7 +868,12 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { - return create_physical_mapping(start, end, nid); + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + return create_physical_mapping(__pa(start), __pa(end), nid); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -893,6 +900,11 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; + if ((start + page_size) >= RADIX_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); BUG_ON(ret); @@ -958,45 +970,44 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - struct list_head *lh = (struct list_head *) pgtable; + struct list_head *lh = (struct list_head *) pgtable; - assert_spin_locked(pmd_lockptr(mm, pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); - /* FIFO */ - if (!pmd_huge_pte(mm, pmdp)) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); - pmd_huge_pte(mm, pmdp) = pgtable; + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { - pte_t *ptep; - pgtable_t pgtable; - struct list_head *lh; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - /* FIFO */ - pgtable = pmd_huge_pte(mm, pmdp); - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; - list_del(lh); - } - ptep = (pte_t *) pgtable; - *ptep = __pte(0); - ptep++; - *ptep = __pte(0); - return pgtable; -} + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned 
long addr, pmd_t *pmdp) + unsigned long addr, pmd_t *pmdp) { pmd_t old_pmd; unsigned long old; @@ -1017,13 +1028,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, @@ -1077,3 +1081,108 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); } + +int __init arch_ioremap_pud_supported(void) +{ + /* HPT does not cope with large pages in the vmalloc area */ + return radix_enabled(); +} + +int __init arch_ioremap_pmd_supported(void) +{ + return radix_enabled(); +} + +int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) +{ + return 0; +} + +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + pte_t *ptep = (pte_t *)pud; + pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot); + + if (!radix_enabled()) + return 0; + + set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud); + + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (pud_huge(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd; + int i; + + pmd = (pmd_t *)pud_page_vaddr(*pud); + pud_clear(pud); + + flush_tlb_kernel_range(addr, addr + PUD_SIZE); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd[i])) { + pte_t *pte; + pte = (pte_t *)pmd_page_vaddr(pmd[i]); + + pte_free_kernel(&init_mm, pte); + } + } + + pmd_free(&init_mm, pmd); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + pte_t *ptep = (pte_t *)pmd; + pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot); + + if (!radix_enabled()) + return 0; + + set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd); + + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_huge(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} + +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + + pte_free_kernel(&init_mm, pte); + + return 1; +} + +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 6a23b9ebd2a1..a95175c0972b 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * TLB flush routines for radix kernels. * * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/mm.h> @@ -29,7 +25,7 @@ * tlbiel instruction for radix, set invalidation * i.e., r=1 and is=01 or is=10 or is=11 */ -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, +static __always_inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, unsigned int pid, unsigned int ric, unsigned int prs) { @@ -55,11 +51,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) * and partition table entries. Then flush the remaining sets of the * TLB. 
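
The set-by-set flush that comment describes looks like this once condensed (a sketch of the process-scoped half of tlbiel_all_isa300(); 'is' = 3 selects all entries, prs = 1 selects process scope):

static void tlbiel_all_sets(unsigned int num_sets, unsigned int is)
{
	unsigned int set;

	asm volatile("ptesync" : : : "memory");

	/* Set 0 with RIC=ALL also clears the PWC and other caches... */
	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);

	/* ...so the remaining sets only need their TLB entries flushed. */
	for (set = 1; set < num_sets; set++)
		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);

	asm volatile("ptesync" : : : "memory");
}
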
*/ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - /* Do the same for process scoped entries. */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) { + /* MSR[HV] should flush partition scope translations first. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + } + + /* Flush process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); @@ -87,10 +87,10 @@ void radix__tlbiel_all(unsigned int action) else WARN(1, "%s called on pre-POWER9 CPU\n", __func__); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); } -static inline void __tlbiel_pid(unsigned long pid, int set, +static __always_inline void __tlbiel_pid(unsigned long pid, int set, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -106,7 +106,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -120,23 +120,7 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -150,25 +134,22 @@ static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) +static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ + rs = lpid; prs = 1; /* process scoped */ r = 1; /* radix format */ - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } - -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -183,8 +164,8 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ 
-199,8 +180,8 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) +static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -215,22 +196,83 @@ static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static inline void fixup_tlbie(void) + +static inline void fixup_tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid, + unsigned long ap) { - unsigned long pid = 0; + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, RIC_FLUSH_TLB); + } +} + +static inline void fixup_tlbie_pid(unsigned long pid) +{ + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_pid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } } + +static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long ap) +{ + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB); + } +} + static inline void fixup_tlbie_lpid(unsigned long lpid) { + /* + * We can use any address for the invalidation, pick one which is + * probably unused as an optimisation. + */ unsigned long va = ((1UL << 52) - 1); - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + asm volatile("ptesync": : :"memory"); + __tlbie_lpid(0, RIC_FLUSH_TLB); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { asm volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); } @@ -239,7 +281,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid) /* * We use 128 set in radix mode and 256 set in hpt mode. 
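
All of the _tlbie_* helpers in this file follow the same broadcast sequence; condensed into one place for reference (a sketch, not new code):

static inline void tlbie_one(unsigned long va, unsigned long pid,
			     unsigned long ap)
{
	asm volatile("ptesync" : : : "memory");	/* order prior PTE updates */
	__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);	/* the broadcast invalidate */
	fixup_tlbie_va(va, pid, ap);		/* POWER9 errata workaround */
	/* wait for completion on all CPUs */
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
}
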
*/ -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; @@ -262,7 +304,7 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) __tlbiel_pid(pid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory"); } static inline void _tlbie_pid(unsigned long pid, unsigned long ric) @@ -277,6 +319,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_pid(pid, RIC_FLUSH_TLB); + fixup_tlbie_pid(pid); break; case RIC_FLUSH_PWC: __tlbie_pid(pid, RIC_FLUSH_PWC); @@ -284,37 +327,42 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_pid(pid, RIC_FLUSH_ALL); + fixup_tlbie_pid(pid); } - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) +struct tlbiel_pid { + unsigned long pid; + unsigned long ric; +}; + +static void do_tlbiel_pid(void *info) { - int set; + struct tlbiel_pid *t = info; - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_pid(t->pid, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_pid(t->pid, RIC_FLUSH_PWC); + else + _tlbiel_pid(t->pid, RIC_FLUSH_ALL); +} - asm volatile("ptesync": : :"memory"); +static inline void _tlbiel_pid_multicast(struct mm_struct *mm, + unsigned long pid, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_pid t = { .pid = pid, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1); /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. + * Always want the CPU translations to be invalidated with tlbiel in + * these paths, so while coprocessors must use tlbie, we can not + * optimise away the tlbiel component. */ - __tlbiel_lpid(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_pid(pid, RIC_FLUSH_ALL); } static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) @@ -329,6 +377,7 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) switch (ric) { case RIC_FLUSH_TLB: __tlbie_lpid(lpid, RIC_FLUSH_TLB); + fixup_tlbie_lpid(lpid); break; case RIC_FLUSH_PWC: __tlbie_lpid(lpid, RIC_FLUSH_PWC); @@ -336,40 +385,33 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) case RIC_FLUSH_ALL: default: __tlbie_lpid(lpid, RIC_FLUSH_ALL); + fixup_tlbie_lpid(lpid); } - fixup_tlbie_lpid(lpid); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) +static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. 
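
The multicast helpers introduced above are the tlbiel counterpart to broadcast tlbie: IPI every CPU in mm_cpumask() to run the local flush, then issue a real tlbie only if a coprocessor holds translations. A condensed sketch with illustrative names (note the if/else over 'ric': _tlbiel_pid() is __always_inline and feeds an "i" asm constraint, so it must see a compile-time constant):

struct flush_arg { unsigned long pid; unsigned long ric; };

static void do_local_flush(void *info)
{
	struct flush_arg *t = info;

	if (t->ric == RIC_FLUSH_TLB)
		_tlbiel_pid(t->pid, RIC_FLUSH_TLB);
	else
		_tlbiel_pid(t->pid, RIC_FLUSH_ALL);
}

static void flush_pid_multicast(struct mm_struct *mm,
				unsigned long pid, unsigned long ric)
{
	struct flush_arg t = { .pid = pid, .ric = ric };

	/* final argument 1: wait until every CPU has flushed */
	on_each_cpu_mask(mm_cpumask(mm), do_local_flush, &t, 1);

	if (atomic_read(&mm->context.copros) > 0)
		_tlbie_pid(pid, RIC_FLUSH_ALL);
}
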
+ * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } - static inline void __tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) @@ -381,8 +423,8 @@ static inline void __tlbiel_va_range(unsigned long start, unsigned long end, __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); } -static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) +static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) { unsigned long ap = mmu_get_ap(psize); @@ -411,27 +453,76 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end, for (addr = start; addr < end; addr += page_size) __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + + fixup_tlbie_va_range(addr - page_size, pid, ap); } -static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) +static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) { unsigned long ap = mmu_get_ap(psize); asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); + fixup_tlbie_va(va, pid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, +struct tlbiel_va { + unsigned long pid; + unsigned long va; + unsigned long psize; + unsigned long ric; +}; + +static void do_tlbiel_va(void *info) +{ + struct tlbiel_va *t = info; + + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC); + else + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_va_multicast(struct mm_struct *mm, + unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va(va, pid, psize, RIC_FLUSH_TLB); +} + +struct tlbiel_va_range { + unsigned long pid; + unsigned long start; + unsigned long end; + unsigned long page_size; + unsigned long psize; + bool also_pwc; +}; + +static void do_tlbiel_va_range(void *info) +{ + struct tlbiel_va_range *t = info; + + _tlbiel_va_range(t->start, t->end, t->pid, t->page_size, + t->psize, t->also_pwc); +} + +static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long psize, unsigned long ric) { unsigned long ap = mmu_get_ap(psize); asm 
volatile("ptesync": : :"memory"); __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); + fixup_tlbie_lpid_va(va, lpid, ap); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -443,10 +534,24 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, if (also_pwc) __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va_range t = { .start = start, .end = end, + .pid = pid, .page_size = page_size, + .psize = psize, .also_pwc = also_pwc }; + + on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); +} + /* * Base TLB flushing operations: * @@ -584,10 +689,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto local; } - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } else { local: _tlbiel_pid(pid, RIC_FLUSH_TLB); @@ -613,25 +722,23 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) goto local; } } - _tlbie_pid(pid, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } else { local: _tlbiel_pid(pid, RIC_FLUSH_ALL); } preempt_enable(); } + void radix__flush_all_mm(struct mm_struct *mm) { __flush_all_mm(mm, false); } EXPORT_SYMBOL(radix__flush_all_mm); -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) -{ - tlb->need_flush_all = 1; -} -EXPORT_SYMBOL(radix__flush_tlb_pwc); - void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { @@ -648,7 +755,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + else + _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); } else { local: _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); @@ -670,9 +780,35 @@ EXPORT_SYMBOL(radix__flush_tlb_page); #define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ +static void do_tlbiel_kernel(void *info) +{ + _tlbiel_pid(0, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_kernel_broadcast(void) +{ + on_each_cpu(do_tlbiel_kernel, NULL, 1); + if (tlbie_capable) { + /* + * Coherent accelerators don't refcount kernel memory mappings, + * so have to always issue a tlbie for them. This is quite a + * slow path anyway. + */ + _tlbie_pid(0, RIC_FLUSH_ALL); + } +} + +/* + * If kernel TLBIs ever become local rather than global, then + * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it + * assumes kernel TLBIs are global. 
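
Putting the kernel-space flush policy above in one place: prefer a single broadcast tlbie; otherwise tlbiel on every online CPU, plus one tlbie for coherent accelerators, which never see IPIs. A condensed sketch (illustrative only):

static void flush_kernel_tlb(void)
{
	if (cputlb_use_tlbie()) {
		_tlbie_pid(0, RIC_FLUSH_ALL);
	} else {
		on_each_cpu(do_tlbiel_kernel, NULL, 1);
		if (tlbie_capable)
			_tlbie_pid(0, RIC_FLUSH_ALL);
	}
}
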
+ */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - _tlbie_pid(0, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(0, RIC_FLUSH_ALL); + else + _tlbiel_kernel_broadcast(); } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); @@ -691,8 +827,7 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool flush_all_sizes) + unsigned long start, unsigned long end) { unsigned long pid; @@ -728,54 +863,48 @@ is_local: if (local) { _tlbiel_pid(pid, RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } } else { - bool hflush = flush_all_sizes; - bool gflush = flush_all_sizes; + bool hflush = false; unsigned long hstart, hend; - unsigned long gstart, gend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - hflush = true; - - if (hflush) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { hstart = (start + PMD_SIZE - 1) & PMD_MASK; hend = end & PMD_MASK; if (hstart == hend) hflush = false; + else + hflush = true; } - if (gflush) { - gstart = (start + PUD_SIZE - 1) & PUD_MASK; - gend = end & PUD_MASK; - if (gstart == gend) - gflush = false; - } - - asm volatile("ptesync": : :"memory"); if (local) { + asm volatile("ptesync": : :"memory"); __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbiel_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); - } else { + } else if (cputlb_use_tlbie()) { + asm volatile("ptesync": : :"memory"); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbie_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } else { + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, mmu_virtual_psize, false); + if (hflush) + _tlbiel_va_range_multicast(mm, + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); } } preempt_enable(); @@ -790,7 +919,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, return radix__flush_hugetlb_tlb_range(vma, start, end); #endif - __radix__flush_tlb_range(vma->vm_mm, start, end, false); + __radix__flush_tlb_range(vma->vm_mm, start, end); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -834,32 +963,19 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ -void radix__flush_tlb_lpid(unsigned int lpid) +void radix__flush_all_lpid(unsigned int lpid) { _tlbie_lpid(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__local_flush_tlb_lpid(unsigned int lpid) -{ - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); +EXPORT_SYMBOL_GPL(radix__flush_all_lpid); /* - * Flush process scoped translations from LPID (=LPIDR). 
- * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. + * Flush process scoped translations from LPID (=LPIDR) */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +void radix__flush_all_lpid_guest(unsigned int lpid) { - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); + _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); @@ -879,56 +995,22 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (tlb->fullmm) { + if (tlb->fullmm || tlb->need_flush_all) { __flush_all_mm(mm, true); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) - } else if (mm_tlb_flush_nested(mm)) { - /* - * If there is a concurrent invalidation that is clearing ptes, - * then it's possible this invalidation will miss one of those - * cleared ptes and miss flushing the TLB. If this invalidate - * returns before the other one flushes TLBs, that can result - * in it returning while there are still valid TLBs inside the - * range to be invalidated. - * - * See mm/memory.c:tlb_finish_mmu() for more details. - * - * The solution to this is ensure the entire range is always - * flushed here. The problem for powerpc is that the flushes - * are page size specific, so this "forced flush" would not - * do the right thing if there are a mix of page sizes in - * the range to be invalidated. So use __flush_tlb_range - * which invalidates all possible page sizes in the range. - * - * PWC flush probably is not be required because the core code - * shouldn't free page tables in this path, but accounting - * for the possibility makes us a bit more robust. - * - * need_flush_all is an uncommon case because page table - * teardown should be done with exclusive locks held (but - * after locks are dropped another invalidate could come - * in), it could be optimized further if necessary. - */ - if (!tlb->need_flush_all) - __radix__flush_tlb_range(mm, start, end, true); - else - radix__flush_all_mm(mm); -#endif } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { - if (!tlb->need_flush_all) + if (!tlb->freed_tables) radix__flush_tlb_mm(mm); else radix__flush_all_mm(mm); } else { - if (!tlb->need_flush_all) + if (!tlb->freed_tables) radix__flush_tlb_range_psize(mm, start, end, psize); else radix__flush_tlb_pwc_range_psize(mm, start, end, psize); } - tlb->need_flush_all = 0; } -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, +static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize, bool also_pwc) { @@ -965,16 +1047,26 @@ is_local: if (local) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } - _tlbie_pid(pid, also_pwc ? 
RIC_FLUSH_ALL : RIC_FLUSH_TLB); } } else { if (local) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else + else if (cputlb_use_tlbie()) _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, psize, also_pwc); } preempt_enable(); } @@ -1016,7 +1108,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + if (cputlb_use_tlbie()) + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + else + _tlbiel_va_range_multicast(mm, + addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } else { local: _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/book3s64/slb.c index 5986df48359b..716204aee3da 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC64 SLB support. * @@ -6,12 +7,6 @@ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com * Copyright (c) 2001 Dave Engebretsen * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <asm/asm-prototypes.h> @@ -554,7 +549,8 @@ void slb_initialize(void) asm volatile("isync; slbia; isync":::"memory"); create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - /* For the boot cpu, we're running on the stack in init_thread_union, + /* + * For the boot cpu, we're running on the stack in init_thread_union, * which is in the first segment of the linear mapping, and also * get_paca()->kstack hasn't been initialized yet. * For secondary cpus, we need to bolt the kernel stack entry now. 
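
The bolting that comment refers to: each secondary CPU shadows an SLB entry for its kernel stack so a stack access can never take an unrecoverable SLB miss. The step in slb_initialize() is roughly (a sketch from the surrounding function, not part of this hunk):

	slb_shadow_clear(KSTACK_INDEX);
	if (raw_smp_processor_id() != boot_cpuid &&
	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
		create_shadowed_slbe(get_paca()->kstack, mmu_kernel_ssize,
				     lflags, KSTACK_INDEX);
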
@@ -691,10 +687,10 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) unsigned long flags; int ssize; - if (id == KERNEL_REGION_ID) { + if (id == LINEAR_MAP_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; @@ -702,20 +698,25 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) #ifdef CONFIG_SPARSEMEM_VMEMMAP } else if (id == VMEMMAP_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMEMMAP_END) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; #endif } else if (id == VMALLOC_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMALLOC_END) return -EFAULT; - if (ea < H_VMALLOC_END) - flags = local_paca->vmalloc_sllp; - else - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + flags = local_paca->vmalloc_sllp; + + } else if (id == IO_REGION_ID) { + + if (ea >= H_KERN_IO_END) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { return -EFAULT; } @@ -725,6 +726,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) ssize = MMU_SEGSIZE_256M; context = get_kernel_context(ea); + return slb_insert_entry(ea, context, flags, ssize, true); } @@ -739,7 +741,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= mm->context.slb_addr_limit) + if (ea >= mm_ctx_slb_addr_limit(&mm->context)) return -EFAULT; context = get_user_context(&mm->context, ea); @@ -761,7 +763,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) long do_slb_fault(struct pt_regs *regs, unsigned long ea) { - unsigned long id = REGION_ID(ea); + unsigned long id = get_region_id(ea); /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); @@ -784,7 +786,7 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) * first class kernel code. But for performance it's probably nicer * if they go via fast_exception_return too. */ - if (id >= KERNEL_REGION_ID) { + if (id >= LINEAR_MAP_REGION_ID) { long err; #ifdef CONFIG_DEBUG_VM /* Catch recursive kernel SLB faults. */ diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 5e4178790dee..2ef24a53f4c9 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -1,17 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2007-2008 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/types.h> -#include <linux/mm.h> +#include <linux/pagewalk.h> #include <linux/hugetlb.h> #include <linux/syscalls.h> @@ -25,10 +21,13 @@ */ void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); unsigned long i, j, addr; u32 **p; + if (!spt) + return; + for (i = 0; i < 4; ++i) { if (spt->low_prot[i]) { free_page((unsigned long)spt->low_prot[i]); @@ -48,13 +47,7 @@ void subpage_prot_free(struct mm_struct *mm) free_page((unsigned long)p); } spt->maxaddr = 0; -} - -void subpage_prot_init_new_context(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = &mm->context.spt; - - memset(spt, 0, sizeof(*spt)); + kfree(spt); } static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, @@ -93,13 +86,18 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; unsigned long next, limit; down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) + goto err_out; + limit = addr + len; if (limit > spt->maxaddr) limit = spt->maxaddr; @@ -127,6 +125,8 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); } + +err_out: up_write(&mm->mmap_sem); } @@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +static const struct mm_walk_ops subpage_walk_ops = { + .pmd_entry = subpage_walk_pmd_entry, +}; + static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; /* * We don't try too hard, we just mark all the vma in that range @@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, if (vma->vm_start >= (addr + len)) break; vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); + walk_page_vma(vma, &subpage_walk_ops, NULL); vma = vma->vm_next; } } @@ -189,7 +189,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; @@ -218,6 +218,21 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, return -EFAULT; down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) { + /* + * Allocate subpage prot table if not already done. 
+ * Do this with mmap_sem held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index c8da352e8686..beb060b96632 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * CoProcessor (SPU/AFU) mm fault handler * @@ -5,20 +6,6 @@ * * Author: Arnd Bergmann <arndb@de.ibm.com> * Author: Jeremy Kerr <jk@ozlabs.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/sched.h> #include <linux/mm.h> @@ -105,7 +92,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) u64 vsid, vsidkey; int psize, ssize; - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); if (mm == NULL) @@ -117,16 +104,20 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) break; case VMALLOC_REGION_ID: pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + case IO_REGION_ID: + pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; break; - case KERNEL_REGION_ID: - pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); + case LINEAR_MAP_REGION_ID: + pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea); psize = mmu_linear_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index b5d2658c26af..5ab4f868e919 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -1,316 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * PowerPC version derived from arch/arm/mm/consistent.c * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) * * Copyright (C) 2000 Russell King - * - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. The function return - * is the virtual address and 'dma_handle' is the physical address. - * Mostly stolen from the ARM port, with some changes for PowerPC. - * -- Dan - * - * Reorganized to get rid of the arch-specific consistent_* functions - * and provide non-coherent implementations for the DMA API. 
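The subpage_prot conversion above stops embedding `struct subpage_prot_table` in the context and allocates it on first use of the syscall, with `mmap_sem` held for writing so two racing callers cannot both install a table (and the read paths simply bail when the pointer is still NULL). The same check-allocate-publish pattern, sketched with a pthread rwlock in place of `mmap_sem`:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct subpage_prot_table { unsigned long maxaddr; /* ... */ };

struct ctx {
	pthread_rwlock_t lock;	/* stands in for mmap_sem */
	struct subpage_prot_table *spt;
};

/* Allocate the table lazily; the write lock makes the publish atomic. */
static struct subpage_prot_table *get_spt(struct ctx *c)
{
	pthread_rwlock_wrlock(&c->lock);
	if (!c->spt) {
		struct subpage_prot_table *spt = calloc(1, sizeof(*spt));
		if (!spt) {
			pthread_rwlock_unlock(&c->lock);
			return NULL;	/* -ENOMEM in the kernel */
		}
		c->spt = spt;	/* publish under the lock */
	}
	pthread_rwlock_unlock(&c->lock);
	return c->spt;
}

int main(void)
{
	struct ctx c = { .lock = PTHREAD_RWLOCK_INITIALIZER, .spt = NULL };

	printf("spt=%p\n", (void *)get_spt(&c));
	free(c.spt);
	return 0;
}
```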
-Matt - * - * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() - * implementation. This is pulled straight from ARM and barely - * modified. -Matt - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/string.h> #include <linux/types.h> #include <linux/highmem.h> #include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> -#include <linux/export.h> #include <asm/tlbflush.h> #include <asm/dma.h> -#include "mmu_decl.h" - -/* - * This address range defaults to a value that is safe for all - * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It - * can be further configured for specific applications under - * the "Advanced Setup" menu. -Matt - */ -#define CONSISTENT_BASE (IOREMAP_TOP) -#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) -#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) - -/* - * This is the page table (2MB) covering uncached, DMA consistent allocations - */ -static DEFINE_SPINLOCK(consistent_lock); - -/* - * VM region handling support. - * - * This should become something generic, handling VM region allocations for - * vmalloc and similar (ioremap, module space, etc). - * - * I envisage vmalloc()'s supporting vm_struct becoming: - * - * struct vm_struct { - * struct vm_region region; - * unsigned long flags; - * struct page **pages; - * unsigned int nr_pages; - * unsigned long phys_addr; - * }; - * - * get_vm_area() would then call vm_region_alloc with an appropriate - * struct vm_region head (eg): - * - * struct vm_region vmalloc_head = { - * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), - * .vm_start = VMALLOC_START, - * .vm_end = VMALLOC_END, - * }; - * - * However, vmalloc_head.vm_start is variable (typically, it is dependent on - * the amount of RAM found at boot time.) I would imagine that get_vm_area() - * would have to initialise this each time prior to calling vm_region_alloc(). - */ -struct ppc_vm_region { - struct list_head vm_list; - unsigned long vm_start; - unsigned long vm_end; -}; - -static struct ppc_vm_region consistent_head = { - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_start = CONSISTENT_BASE, - .vm_end = CONSISTENT_END, -}; - -static struct ppc_vm_region * -ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) -{ - unsigned long addr = head->vm_start, end = head->vm_end - size; - unsigned long flags; - struct ppc_vm_region *c, *new; - - new = kmalloc(sizeof(struct ppc_vm_region), gfp); - if (!new) - goto out; - - spin_lock_irqsave(&consistent_lock, flags); - - list_for_each_entry(c, &head->vm_list, vm_list) { - if ((addr + size) < addr) - goto nospc; - if ((addr + size) <= c->vm_start) - goto found; - addr = c->vm_end; - if (addr > end) - goto nospc; - } - - found: - /* - * Insert this entry _before_ the one we found. 
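The `ppc_vm_region_alloc()` being deleted above is a classic first-fit allocator over a list kept sorted by address: walk the existing regions, and take the first gap (before some region, or after the last one) big enough for the request. A runnable distillation of that walk:

```c
#include <stdio.h>
#include <stdlib.h>

struct region {
	struct region *next;	/* list kept sorted by vm_start */
	unsigned long vm_start, vm_end;
};

/* First fit in [base, limit): mirrors the deleted ppc_vm_region_alloc(). */
static struct region *region_alloc(struct region **head, unsigned long base,
				   unsigned long limit, unsigned long size)
{
	unsigned long addr = base;
	struct region **link = head, *c, *new;

	for (c = *head; c; link = &c->next, c = c->next) {
		if (addr + size < addr)		/* address space wrapped */
			return NULL;
		if (addr + size <= c->vm_start)	/* gap before c fits */
			break;
		addr = c->vm_end;		/* otherwise try after c */
	}
	if (addr + size > limit)
		return NULL;			/* no space left */

	new = malloc(sizeof(*new));
	if (!new)
		return NULL;
	new->vm_start = addr;
	new->vm_end = addr + size;
	new->next = c;
	*link = new;				/* insert before c (or at tail) */
	return new;
}

int main(void)
{
	struct region *head = NULL;
	struct region *a = region_alloc(&head, 0x1000, 0x10000, 0x2000);
	struct region *b = region_alloc(&head, 0x1000, 0x10000, 0x1000);

	printf("%lx %lx\n", a->vm_start, b->vm_start);	/* 1000 3000 */
	return 0;
}
```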
- */ - list_add_tail(&new->vm_list, &c->vm_list); - new->vm_start = addr; - new->vm_end = addr + size; - - spin_unlock_irqrestore(&consistent_lock, flags); - return new; - - nospc: - spin_unlock_irqrestore(&consistent_lock, flags); - kfree(new); - out: - return NULL; -} - -static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) -{ - struct ppc_vm_region *c; - - list_for_each_entry(c, &head->vm_list, vm_list) { - if (c->vm_start == addr) - goto out; - } - c = NULL; - out: - return c; -} - -/* - * Allocate DMA-coherent memory space and return both the kernel remapped - * virtual and bus address for that space. - */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - struct page *page; - struct ppc_vm_region *c; - unsigned long order; - u64 mask = ISA_DMA_THRESHOLD, limit; - - if (dev) { - mask = dev->coherent_dma_mask; - - /* - * Sanity check the DMA mask - it must be non-zero, and - * must be able to be satisfied by a DMA allocation. - */ - if (mask == 0) { - dev_warn(dev, "coherent DMA mask is unset\n"); - goto no_page; - } - - if ((~mask) & ISA_DMA_THRESHOLD) { - dev_warn(dev, "coherent DMA mask %#llx is smaller " - "than system GFP_DMA mask %#llx\n", - mask, (unsigned long long)ISA_DMA_THRESHOLD); - goto no_page; - } - } - - - size = PAGE_ALIGN(size); - limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || - size >= (CONSISTENT_END - CONSISTENT_BASE)) { - printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", - size, mask); - return NULL; - } - - order = get_order(size); - - /* Might be useful if we ever have a real legacy DMA zone... */ - if (mask != 0xffffffff) - gfp |= GFP_DMA; - - page = alloc_pages(gfp, order); - if (!page) - goto no_page; - - /* - * Invalidate any data that might be lurking in the - * kernel direct-mapped region for device DMA. - */ - { - unsigned long kaddr = (unsigned long)page_address(page); - memset(page_address(page), 0, size); - flush_dcache_range(kaddr, kaddr + size); - } - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = ppc_vm_region_alloc(&consistent_head, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); - if (c) { - unsigned long vaddr = c->vm_start; - struct page *end = page + (1 << order); - - split_page(page, order); - - /* - * Set the "dma handle" - */ - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - - do { - SetPageReserved(page); - map_kernel_page(vaddr, page_to_phys(page), - pgprot_noncached(PAGE_KERNEL)); - page++; - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* - * Free the otherwise unused pages. - */ - while (page < end) { - __free_page(page); - page++; - } - - return (void *)c->vm_start; - } - - if (page) - __free_pages(page, order); - no_page: - return NULL; -} - -/* - * free a page as defined by the above mapping. 
- */ -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - struct ppc_vm_region *c; - unsigned long flags, addr; - - size = PAGE_ALIGN(size); - - spin_lock_irqsave(&consistent_lock, flags); - - c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); - if (!c) - goto no_area; - - if ((c->vm_end - c->vm_start) != size) { - printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - addr = c->vm_start; - do { - pte_t *ptep; - unsigned long pfn; - - ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), - addr), - addr), - addr); - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, addr, ptep); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - __free_reserved_page(page); - } - } - addr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - list_del(&c->vm_list); - - spin_unlock_irqrestore(&consistent_lock, flags); - - kfree(c); - return; - - no_area: - spin_unlock_irqrestore(&consistent_lock, flags); - printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", - __func__, vaddr); - dump_stack(); -} - /* * make an area consistent. */ @@ -399,35 +104,21 @@ static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) #endif } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } -/* - * Return the PFN for a given cpu virtual address returned by arch_dma_alloc. - */ -long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, - dma_addr_t dma_addr) +void arch_dma_prep_coherent(struct page *page, size_t size) { - /* This should always be populated, so we don't test every - * level. If that fails, we'll have a nice crash which - * will be as good as a BUG_ON() - */ - unsigned long cpu_addr = (unsigned long)vaddr; - pgd_t *pgd = pgd_offset_k(cpu_addr); - pud_t *pud = pud_offset(pgd, cpu_addr); - pmd_t *pmd = pmd_offset(pud, cpu_addr); - pte_t *ptep = pte_offset_kernel(pmd, cpu_addr); + unsigned long kaddr = (unsigned long)page_address(page); - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - return pte_pfn(*ptep); + flush_dcache_range(kaddr, kaddr + size); } diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 3f1803672c9b..59327cefbc6a 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Dynamic reconfiguration memory support * * Copyright 2017 IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
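With the local allocator gone, the arch keeps only thin cache-maintenance hooks: `arch_sync_dma_for_device()`/`arch_sync_dma_for_cpu()` (now without the unused `struct device *`) and `arch_dma_prep_coherent()`, which writes back and invalidates the linear-map alias of a freshly allocated buffer before the generic DMA code remaps it uncached. A sketch of the whole-cache-line flush that `flush_dcache_range()` performs; the 128-byte line size is an assumption for illustration:

```c
#include <stdio.h>

#define L1_CACHE_BYTES 128UL	/* typical for 64-bit Book3S; assumption */

/* Stub: the real primitive issues dcbf per line, then a sync. */
static void dcbf(unsigned long line)
{
	printf("dcbf %#lx\n", line);
}

/* Models flush_dcache_range(): write back + invalidate whole lines. */
static void flush_dcache_range(unsigned long start, unsigned long stop)
{
	unsigned long addr = start & ~(L1_CACHE_BYTES - 1);

	for (; addr < stop; addr += L1_CACHE_BYTES)
		dcbf(addr);
}

/* Models arch_dma_prep_coherent(): flush the kernel alias of the buffer. */
static void dma_prep_coherent(unsigned long kaddr, unsigned long size)
{
	flush_dcache_range(kaddr, kaddr + size);
}

int main(void)
{
	dma_prep_coherent(0x10040, 0x100);	/* flushes 3 lines */
	return 0;
}
```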
*/ #define pr_fmt(fmt) "drmem: " fmt @@ -366,8 +362,10 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) if (!drmem_info->lmbs) return; - for_each_drmem_lmb(lmb) + for_each_drmem_lmb(lmb) { read_drconf_v1_cell(lmb, &prop); + lmb_set_nid(lmb); + } } static void __init init_drmem_v2_lmbs(const __be32 *prop) @@ -412,6 +410,8 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) lmb->aa_index = dr_cell.aa_index; lmb->flags = dr_cell.flags; + + lmb_set_nid(lmb); } } } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 887f11bcf330..b5047f9b5dec 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -8,11 +9,6 @@ * Modified by Cort Dougan and Paul Mackerras. * * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/signal.h> @@ -44,26 +40,7 @@ #include <asm/mmu_context.h> #include <asm/siginfo.h> #include <asm/debug.h> - -static inline bool notify_page_fault(struct pt_regs *regs) -{ - bool ret = false; - -#ifdef CONFIG_KPROBES - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = true; - preempt_enable(); - } -#endif /* CONFIG_KPROBES */ - - if (unlikely(debugger_fault_handler(regs))) - ret = true; - - return ret; -} +#include <asm/kup.h> /* * Check whether the instruction inst is a store using @@ -181,13 +158,12 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; - force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, - current); + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return 0; } #endif - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); return 0; } @@ -223,19 +199,46 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, } /* Is this a bad kernel fault ? */ -static bool bad_kernel_fault(bool is_exec, unsigned long error_code, - unsigned long address) +static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address, bool is_write) { + int is_exec = TRAP(regs) == 0x400; + /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) { - printk_ratelimited(KERN_CRIT "kernel tried to execute" - " exec-protected page (%lx) -" - "exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, - current_uid())); + pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n", + address >= TASK_SIZE ? "exec-protected" : "user", + address, + from_kuid(&init_user_ns, current_uid())); + + // Kernel exec fault is always bad + return true; + } + + if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && + !search_exception_tables(regs->nip)) { + pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? 
(uid: %d)\n", + address, + from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE); + + // Kernel fault on kernel address is bad + if (address >= TASK_SIZE) + return true; + + // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad + if (!search_exception_tables(regs->nip)) + return true; + + // Read/write fault in a valid region (the exception table search passed + // above), but blocked by KUAP is bad, it can never succeed. + if (bad_kuap_fault(regs, is_write)) + return true; + + // What's left? Kernel fault on user in well defined regions (extable + // matched), and allowed by KUAP in the faulting context. + return false; } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, @@ -438,8 +441,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = page_fault_is_write(error_code); vm_fault_t fault, major = 0; bool must_retry = false; + bool kprobe_fault = kprobe_page_fault(regs, 11); - if (notify_page_fault(regs)) + if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) return 0; if (unlikely(page_fault_is_bad(error_code))) { @@ -455,9 +459,10 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, /* * The kernel should never take an execute fault nor should it - * take a page fault to a kernel address. + * take a page fault to a kernel address or a page fault to a user + * address outside of dedicated places */ - if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) return SIGSEGV; /* @@ -640,6 +645,7 @@ NOKPROBE_SYMBOL(do_page_fault); void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; + int is_write = page_fault_is_write(regs->dsisr); /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -653,9 +659,10 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) case 0x300: case 0x380: case 0xe00: - pr_alert("BUG: %s at 0x%08lx\n", + pr_alert("BUG: %s on %s at 0x%08lx\n", regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" : - "Unable to handle kernel data access", regs->dar); + "Unable to handle kernel data access", + is_write ? 
"write" : "read", regs->dar); break; case 0x400: case 0x480: diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 82a0e37557a5..320c1672b2ae 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -43,9 +43,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif + WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); local_flush_tlb_page(NULL, vaddr); @@ -56,7 +54,6 @@ EXPORT_SYMBOL(kmap_atomic_prot); void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type __maybe_unused; if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); @@ -64,14 +61,12 @@ void __kunmap_atomic(void *kvaddr) return; } - type = kmap_atomic_idx(); - -#ifdef CONFIG_DEBUG_HIGHMEM - { + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { + int type = kmap_atomic_idx(); unsigned int idx; idx = type + KM_TYPE_NR * smp_processor_id(); - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); /* * force other mappings to Oops if they'll try to access @@ -80,7 +75,6 @@ void __kunmap_atomic(void *kvaddr) pte_clear(&init_mm, vaddr, kmap_pte-idx); local_flush_tlb_page(NULL, vaddr); } -#endif kmap_atomic_idx_pop(); pagefault_enable(); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9e732bb2c84a..73d4873fc7f8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,20 +26,8 @@ #include <asm/hugetlb.h> #include <asm/pte-walk.h> - -#ifdef CONFIG_HUGETLB_PAGE - -#define PAGE_SHIFT_64K 16 -#define PAGE_SHIFT_512K 19 -#define PAGE_SHIFT_8M 23 -#define PAGE_SHIFT_16M 24 -#define PAGE_SHIFT_16G 34 - bool hugetlb_disabled = false; -unsigned int HPAGE_SHIFT; -EXPORT_SYMBOL(HPAGE_SHIFT); - #define hugepd_none(hpd) (hpd_val(hpd) == 0) #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *))) @@ -73,12 +61,17 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, num_hugepd = 1; } + if (!cachep) { + WARN_ONCE(1, "No page table cache created for hugetlb tables"); + return -ENOMEM; + } + new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); - if (! new) + if (!new) return -ENOMEM; /* @@ -98,19 +91,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = 0; i < num_hugepd; i++, hpdp++) { if (unlikely(!hugepd_none(*hpdp))) break; - else { -#ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | - (shift_to_mmu_psize(pshift) << 2)); -#elif defined(CONFIG_PPC_8xx) - *hpdp = __hugepd(__pa(new) | _PMD_USER | - (pshift == PAGE_SHIFT_8M ? 
_PMD_PAGE_8M : - _PMD_PAGE_512K) | _PMD_PRESENT); -#else - /* We use the old format for PPC_FSL_BOOK3E */ - *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); -#endif - } + hugepd_populate(hpdp, new, pshift); } /* If we bailed from the for loop early, an error occurred, clean up */ if (i < num_hugepd) { @@ -154,6 +135,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz } else { pdshift = PUD_SHIFT; pu = pud_alloc(mm, pg, addr); + if (!pu) + return NULL; if (pshift == PUD_SHIFT) return (pte_t *)pu; else if (pshift > PMD_SHIFT) { @@ -162,6 +145,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz } else { pdshift = PMD_SHIFT; pm = pmd_alloc(mm, pu, addr); + if (!pm) + return NULL; if (pshift == PMD_SHIFT) /* 16MB hugepage */ return (pte_t *)pm; @@ -178,12 +163,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz } else { pdshift = PUD_SHIFT; pu = pud_alloc(mm, pg, addr); + if (!pu) + return NULL; if (pshift >= PUD_SHIFT) { ptl = pud_lockptr(mm, pu); hpdp = (hugepd_t *)pu; } else { pdshift = PMD_SHIFT; pm = pmd_alloc(mm, pu, addr); + if (!pm) + return NULL; ptl = pmd_lockptr(mm, pm); hpdp = (hugepd_t *)pm; } @@ -250,7 +239,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h) return __alloc_bootmem_huge_page(h); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) +#ifndef CONFIG_PPC_BOOK3S_64 #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -535,30 +524,6 @@ retry: return page; } -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? __boundary : end; -} - -int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift, - unsigned long end, int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} - #ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -578,24 +543,15 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { -#ifdef CONFIG_PPC_MM_SLICES /* With radix we don't use slice, so derive it from vma*/ - if (!radix_enabled()) { + if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); } -#endif return vma_kernel_pagesize(vma); } -static inline bool is_power_of_4(unsigned long x) -{ - if (is_power_of_2(x)) - return (__ilog2(x) % 2) ? false : true; - return false; -} - static int __init add_huge_page_size(unsigned long long size) { int shift = __ffs(size); @@ -603,37 +559,13 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. 
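The `huge_pte_alloc()` hunks above plug a real hazard: `pud_alloc()` and `pmd_alloc()` can fail under memory pressure, and the old code dereferenced their results unconditionally. The shape of the fix is the usual allocate-and-bail chain, condensed here with hypothetical allocators:

```c
#include <stddef.h>
#include <stdlib.h>

typedef struct { void *table; } pud_t;
typedef struct { void *table; } pmd_t;

/* Hypothetical stand-ins for pud_alloc()/pmd_alloc(); may return NULL. */
static pud_t *pud_alloc_stub(void) { return calloc(1, sizeof(pud_t)); }
static pmd_t *pmd_alloc_stub(void) { return calloc(1, sizeof(pmd_t)); }

/* Each level is checked before the next one is derived from it. */
static void *huge_pte_alloc_sketch(void)
{
	pud_t *pu = pud_alloc_stub();
	pmd_t *pm;

	if (!pu)
		return NULL;	/* previously: dereferenced NULL */
	pm = pmd_alloc_stub();
	if (!pm) {
		free(pu);	/* the kernel just leaves the pud in place */
		return NULL;
	}
	return pm;
}

int main(void)
{
	void *p = huge_pte_alloc_sketch();

	free(p);
	return 0;
}
```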
*/ - if (size <= PAGE_SIZE) - return -EINVAL; -#if defined(CONFIG_PPC_FSL_BOOK3E) - if (!is_power_of_4(size)) + if (size <= PAGE_SIZE || !is_power_of_2(size)) return -EINVAL; -#elif !defined(CONFIG_PPC_8xx) - if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT)) - return -EINVAL; -#endif - if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + mmu_psize = check_and_get_huge_psize(shift); + if (mmu_psize < 0) return -EINVAL; -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * We need to make sure that for different page sizes reported by - * firmware we only add hugetlb support for page sizes that can be - * supported by linux page table layout. - * For now we have - * Radix: 2M and 1G - * Hash: 16M and 16G - */ - if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) - return -EINVAL; - } else { - if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) - return -EINVAL; - } -#endif - BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); /* Return if huge page size has already been setup */ @@ -662,6 +594,7 @@ __setup("hugepagesz=", hugepage_setup_sz); static int __init hugetlbpage_init(void) { + bool configured = false; int psize; if (hugetlb_disabled) { @@ -669,10 +602,10 @@ static int __init hugetlbpage_init(void) return 0; } -#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx) - if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && + !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; -#endif + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { unsigned shift; unsigned pdshift; @@ -710,29 +643,18 @@ static int __init hugetlbpage_init(void) pgtable_cache_add(PTE_INDEX_SIZE); else if (pdshift > shift) pgtable_cache_add(pdshift - shift); -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) - else + else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx)) pgtable_cache_add(PTE_T_ORDER); -#endif + + configured = true; } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) - /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */ - if (mmu_psize_defs[MMU_PAGE_4M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; - else if (mmu_psize_defs[MMU_PAGE_512K].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift; -#else - /* Set default large page size. Currently, we pick 16M or 1M - * depending on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; - else if (mmu_psize_defs[MMU_PAGE_2M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; -#endif + if (configured) { + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_default(); + } else + pr_info("Failed to initialize. 
Disabling HugeTLB"); + return 0; } @@ -745,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page) BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (!PageHighMem(page)) { __flush_dcache_icache(page_address(page+i)); } else { @@ -755,156 +677,3 @@ void flush_dcache_icache_hugepage(struct page *page) } } } - -#endif /* CONFIG_HUGETLB_PAGE */ - -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page _PAGE_PTE set - * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table - * - * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. - * This function need to be called with interrupts disabled. We use this variant - * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED - */ -pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *hpage_shift) -{ - pgd_t pgd, *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (hpage_shift) - *hpage_shift = 0; - - if (is_thp) - *is_thp = false; - - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); - /* - * Always operate on the local stack value. This make sure the - * value don't get updated by a parallel THP split/collapse, - * page fault or a page unmap. The return pte_t * is still not - * stable. So should be checked there for above conditions. - */ - if (pgd_none(pgd)) - return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; - goto out; - } else if (is_hugepd(__hugepd(pgd_val(pgd)))) - hpdp = (hugepd_t *)&pgd; - else { - /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled - */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = READ_ONCE(*pudp); - - if (pud_none(pud)) - return NULL; - else if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } else if (is_hugepd(__hugepd(pud_val(pud)))) - hpdp = (hugepd_t *)&pud; - else { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. - */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *) pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent. 
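The walker removed above documents the discipline for lockless page table walks: snapshot each level with `READ_ONCE()` into a local variable and only ever dispatch on that local copy, so a concurrent THP split/collapse or unmap cannot change the value between the test and the use (the returned `pte_t *` still has to be revalidated by the caller). The snapshot-then-classify step in isolation:

```c
#include <stdio.h>

/* Simplified kernel-style READ_ONCE (GNU C: uses __typeof__). */
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

typedef unsigned long pgd_t;

#define PGD_NONE 0UL
#define PGD_HUGE 1UL	/* hypothetical "leaf" tag for the sketch */

static pgd_t pgdir[1] = { PGD_HUGE };

/*
 * Dispatch only on the local snapshot: even if another CPU rewrites the
 * entry concurrently, every test below sees one consistent value.
 */
static const char *classify(pgd_t *pgdp)
{
	pgd_t pgd = READ_ONCE(*pgdp);	/* snapshot exactly once */

	if (pgd == PGD_NONE)
		return "none";
	if (pgd & PGD_HUGE)
		return "leaf (huge) entry";
	return "points to next level";
}

int main(void)
{
	printf("%s\n", classify(&pgdir[0]));
	return 0;
}
```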
- */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *) pmdp; - goto out; - } else if (is_hugepd(__hugepd(pmd_val(pmd)))) - hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(*hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (hpage_shift) - *hpage_shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(__find_linux_pte); - -int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long pte_end; - struct page *head, *page; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = READ_ONCE(*ptep); - - if (!pte_access_permitted(pte, write)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 1e6910eb70ed..42ef7a6e6098 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -11,12 +12,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
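The removed `gup_hugepte()` shows the standard lockless fast-GUP dance: read the PTE once, take speculative references on the head page, then re-read the PTE and undo everything if it changed underneath. Its control flow, reduced to a runnable skeleton with a toy refcount in place of `page_cache_add_speculative()`:

```c
#include <stdbool.h>
#include <stdio.h>

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct page { int refcount; };

static unsigned long pte;	/* the entry being walked */
static struct page head;

/* Models page_cache_add_speculative(): fails once the count hit zero. */
static bool add_refs_speculative(struct page *p, int refs)
{
	if (p->refcount == 0)
		return false;
	p->refcount += refs;
	return true;
}

static bool gup_one(int refs)
{
	unsigned long orig = READ_ONCE(pte);	/* 1. snapshot the PTE */

	if (!add_refs_speculative(&head, refs))	/* 2. pin speculatively */
		return false;

	if (READ_ONCE(pte) != orig) {		/* 3. did it change? */
		head.refcount -= refs;		/*    roll the pins back */
		return false;
	}
	return true;				/* pages safely pinned */
}

int main(void)
{
	pte = 0x1000;
	head.refcount = 1;
	printf("%d\n", gup_one(3));		/* 1 */
	printf("refcount=%d\n", head.refcount);	/* 4 */
	return 0;
}
```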
- * */ #undef DEBUG @@ -24,6 +19,39 @@ #include <linux/string.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> +#include <asm/kup.h> + +phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; +EXPORT_SYMBOL_GPL(memstart_addr); +phys_addr_t kernstart_addr __ro_after_init; +EXPORT_SYMBOL_GPL(kernstart_addr); +unsigned long kernstart_virt_addr __ro_after_init = KERNELBASE; +EXPORT_SYMBOL_GPL(kernstart_virt_addr); + +static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); +static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); + +static int __init parse_nosmep(char *p) +{ + disable_kuep = true; + pr_warn("Disabling Kernel Userspace Execution Prevention\n"); + return 0; +} +early_param("nosmep", parse_nosmep); + +static int __init parse_nosmap(char *p) +{ + disable_kuap = true; + pr_warn("Disabling Kernel Userspace Access Protection\n"); + return 0; +} +early_param("nosmap", parse_nosmap); + +void __ref setup_kup(void) +{ + setup_kuep(disable_kuep); + setup_kuap(disable_kuap); +} #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 41a3513cadc9..872df48ae41b 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -9,12 +10,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/module.h> @@ -45,8 +40,10 @@ #include <asm/tlb.h> #include <asm/sections.h> #include <asm/hugetlb.h> +#include <asm/kup.h> +#include <asm/kasan.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) /* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */ @@ -59,11 +56,6 @@ phys_addr_t total_memory; phys_addr_t total_lowmem; -phys_addr_t memstart_addr = (phys_addr_t)~0ull; -EXPORT_SYMBOL(memstart_addr); -phys_addr_t kernstart_addr; -EXPORT_SYMBOL(kernstart_addr); - #ifdef CONFIG_RELOCATABLE /* Used in __va()/__pa() */ long long virt_phys_offset; @@ -178,6 +170,10 @@ void __init MMU_init(void) btext_unmap(); #endif + kasan_mmu_init(); + + setup_kup(); + /* Shortly after that, the entire linear mapping will be available */ memblock_set_current_limit(lowmem_end_addr); } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a4c155af1597..4002ced3596f 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -11,12 +12,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
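`init-common.c` now owns the kernel-userspace protection switches: each feature defaults to enabled only when its Kconfig option is set, and the x86-style `nosmep`/`nosmap` command-line parameters can force it off before `setup_kup()` runs. The default-then-override flow in miniature, with plain macros standing in for `IS_ENABLED()` and command-line arguments standing in for `early_param()`:

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define CONFIG_PPC_KUAP 1	/* pretend Kconfig results */
#define CONFIG_PPC_KUEP 0

static bool disable_kuap = !CONFIG_PPC_KUAP;
static bool disable_kuep = !CONFIG_PPC_KUEP;

/* Models the early_param("nosmap") / early_param("nosmep") hooks. */
static void parse_early_param(const char *arg)
{
	if (!strcmp(arg, "nosmap"))
		disable_kuap = true;
	else if (!strcmp(arg, "nosmep"))
		disable_kuep = true;
}

static void setup_kup(void)
{
	printf("KUEP %s, KUAP %s\n",
	       disable_kuep ? "off" : "on", disable_kuap ? "off" : "on");
}

int main(int argc, char **argv)
{
	for (int i = 1; i < argc; i++)
		parse_early_param(argv[i]);
	setup_kup();
	return 0;
}
```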
- * */ #undef DEBUG @@ -66,40 +61,50 @@ #include <asm/iommu.h> #include <asm/vdso.h> -#include "mmu_decl.h" - -phys_addr_t memstart_addr = ~0; -EXPORT_SYMBOL_GPL(memstart_addr); -phys_addr_t kernstart_addr; -EXPORT_SYMBOL_GPL(kernstart_addr); +#include <mm/mmu_decl.h> #ifdef CONFIG_SPARSEMEM_VMEMMAP /* - * Given an address within the vmemmap, determine the pfn of the page that - * represents the start of the section it is within. Note that we have to + * Given an address within the vmemmap, determine the page that + * represents the start of the subsection it is within. Note that we have to * do this by hand as the proffered address may not be correctly aligned. * Subtraction of non-aligned pointers produces undefined results. */ -static unsigned long __meminit vmemmap_section_start(unsigned long page) +static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr) { - unsigned long offset = page - ((unsigned long)(vmemmap)); + unsigned long start_pfn; + unsigned long offset = vmemmap_addr - ((unsigned long)(vmemmap)); /* Return the pfn of the start of the section. */ - return (offset / sizeof(struct page)) & PAGE_SECTION_MASK; + start_pfn = (offset / sizeof(struct page)) & PAGE_SUBSECTION_MASK; + return pfn_to_page(start_pfn); } /* - * Check if this vmemmap page is already initialised. If any section - * which overlaps this vmemmap page is initialised then this page is - * initialised already. + * Since memory is added in sub-section chunks, before creating a new vmemmap + * mapping, the kernel should check whether there is an existing memmap mapping + * covering the new subsection added. This is needed because kernel can map + * vmemmap area using 16MB pages which will cover a memory range of 16G. Such + * a range covers multiple subsections (2M) + * + * If any subsection in the 16G range mapped by vmemmap is valid we consider the + * vmemmap populated (There is a page table entry already present). We can't do + * a page table lookup here because with the hash translation we don't keep + * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long start, int page_size) +static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { - unsigned long end = start + page_size; - start = (unsigned long)(pfn_to_page(vmemmap_section_start(start))); + struct page *start; + unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; + start = vmemmap_subsection_start(vmemmap_addr); - for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) - if (pfn_valid(page_to_pfn((struct page *)start))) + for (; (unsigned long)start < vmemmap_end; start += PAGES_PER_SUBSECTION) + /* + * pfn valid check here is intended to really check + * whether we have any subsection already initialized + * in this range. 
+ */ + if (pfn_valid(page_to_pfn(start))) return 1; return 0; @@ -177,6 +182,21 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) +{ + unsigned long nr_pfn = page_size / sizeof(struct page); + unsigned long start_pfn = page_to_pfn((struct page *)start); + + if ((start_pfn + nr_pfn) > altmap->end_pfn) + return true; + + if (start_pfn < altmap->base_pfn) + return true; + + return false; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { @@ -191,6 +211,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, void *p = NULL; int rc; + /* + * This vmemmap range is backing different subsections. If any + * of that subsection is marked valid, that means we already + * have initialized a page table covering this range and hence + * the vmemmap range is populated. + */ if (vmemmap_populated(start, page_size)) continue; @@ -199,8 +225,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * fail due to alignment issues when using 16MB hugepages, so * fall back to system memory if the altmap allocation fail. */ - if (altmap) + if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { p = altmap_alloc_block_buf(page_size, altmap); + if (!p) + pr_debug("altmap block allocation failed, falling back to system memory"); + } if (!p) p = vmemmap_alloc_block_buf(page_size, node); if (!p) @@ -277,9 +306,10 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, struct page *page; /* - * the section has already be marked as invalid, so - * vmemmap_populated() true means some other sections still - * in this page, so skip it. + * We have already marked the subsection we are trying to remove + * invalid. So if we want to remove the vmemmap range, we + * need to make sure there is no subsection marked valid + * in this range. 
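Two checks in this file now work at sub-section granularity: `vmemmap_populated()` steps through `PAGES_PER_SUBSECTION` chunks to see whether any subsection backed by a candidate 16M vmemmap page is already valid, and `altmap_cross_boundary()` refuses the device-provided altmap when a mapping-sized block of `struct page`s would spill outside `[base_pfn, end_pfn)`. The boundary test is pure pfn arithmetic and easy to check in isolation; this sketch takes the start pfn directly instead of deriving it from the vmemmap address, and the struct-page size and pfn values are invented:

```c
#include <stdbool.h>
#include <stdio.h>

struct vmem_altmap {
	unsigned long base_pfn;	/* first pfn backed by the device */
	unsigned long end_pfn;	/* one past the last backed pfn */
};

#define SIZEOF_STRUCT_PAGE 64UL	/* assumption for the sketch */

/* Mirrors altmap_cross_boundary(): would this block leave the altmap? */
static bool altmap_cross_boundary(const struct vmem_altmap *altmap,
				  unsigned long start_pfn,
				  unsigned long page_size)
{
	unsigned long nr_pfn = page_size / SIZEOF_STRUCT_PAGE;

	if (start_pfn + nr_pfn > altmap->end_pfn)
		return true;
	if (start_pfn < altmap->base_pfn)
		return true;
	return false;
}

int main(void)
{
	struct vmem_altmap a = { .base_pfn = 0x1000, .end_pfn = 0x5000 };

	/* a 16M mapping describes 16M/64 = 256Ki struct pages */
	printf("%d\n", altmap_cross_boundary(&a, 0x1000, 16UL << 20)); /* 1 */
	printf("%d\n", altmap_cross_boundary(&a, 0x1000, 1UL << 20));  /* 0 */
	return 0;
}
```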
*/ if (vmemmap_populated(start, page_size)) continue; diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c new file mode 100644 index 000000000000..fc669643ce6a --- /dev/null +++ b/arch/powerpc/mm/ioremap.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <asm/io-workarounds.h> + +unsigned long ioremap_bot; +EXPORT_SYMBOL(ioremap_bot); + +void __iomem *ioremap(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap); + +void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} + +void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + pte_t pte = __pte(flags); + void *caller = __builtin_return_address(0); + + /* writeable implies dirty for kernel addresses */ + if (pte_write(pte)) + pte = pte_mkdirty(pte); + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + pte = pte_exprotect(pte); + pte = pte_mkprivileged(pte); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, pte_pgprot(pte), caller); + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); +} +EXPORT_SYMBOL(ioremap_prot); + +int early_ioremap_range(unsigned long ea, phys_addr_t pa, + unsigned long size, pgprot_t prot) +{ + unsigned long i; + + for (i = 0; i < size; i += PAGE_SIZE) { + int err = map_kernel_page(ea + i, pa + i, prot); + + if (WARN_ON_ONCE(err)) /* Should clean up */ + return err; + } + + return 0; +} + +void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, + pgprot_t prot, void *caller) +{ + struct vm_struct *area; + int ret; + unsigned long va; + + area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); + if (area == NULL) + return NULL; + + area->phys_addr = pa; + va = (unsigned long)area->addr; + + ret = ioremap_page_range(va, va + size, pa, prot); + if (!ret) + return (void __iomem *)area->addr + offset; + + unmap_kernel_range(va, size); + free_vm_area(area); + + return NULL; +} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c new file mode 100644 index 000000000000..743e11384dea --- /dev/null +++ b/arch/powerpc/mm/ioremap_32.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <mm/mmu_decl.h> + +void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) +{ + unsigned long v; + phys_addr_t p, offset; + int err; + + 
/* + * Choose an address to map it to. + * Once the vmalloc system is running, we use it. + * Before then, we use space going down from IOREMAP_TOP + * (ioremap_bot records where we're up to). + */ + p = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + size = PAGE_ALIGN(addr + size) - p; + + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (p < 16 * 1024 * 1024) + p += _ISA_MEM_BASE; + +#ifndef CONFIG_CRASH_DUMP + /* + * Don't allow anybody to remap normal RAM that we're using. + * mem_init() sets high_memory so only do the check after that. + */ + if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && + page_is_ram(__phys_to_pfn(p))) { + pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__, + (unsigned long long)p, __builtin_return_address(0)); + return NULL; + } +#endif + + if (size == 0) + return NULL; + + /* + * Is it already mapped? Perhaps overlapped by a previous + * mapping. + */ + v = p_block_mapped(p); + if (v) + return (void __iomem *)v + offset; + + if (slab_is_available()) + return do_ioremap(p, offset, size, prot, caller); + + /* + * Should check if it is a candidate for a BAT mapping + */ + pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); + + err = early_ioremap_range(ioremap_bot - size, p, size, prot); + if (err) + return NULL; + ioremap_bot -= size; + + return (void __iomem *)ioremap_bot + offset; +} + +void iounmap(volatile void __iomem *addr) +{ + /* + * If mapped by BATs then there is nothing to do. + * Calling vfree() generates a benign warning. + */ + if (v_block_mapped((unsigned long)addr)) + return; + + if (addr > high_memory && (unsigned long)addr < ioremap_bot) + vunmap((void *)(PAGE_MASK & (unsigned long)addr)); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c new file mode 100644 index 000000000000..50a99d9684f7 --- /dev/null +++ b/arch/powerpc/mm/ioremap_64.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/** + * Low level function to establish the page tables for an IO mapping + */ +void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) +{ + int ret; + unsigned long va = (unsigned long)ea; + + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + + if ((ea + size) >= (void *)IOREMAP_END) { + pr_warn("Outside the supported range\n"); + return NULL; + } + + WARN_ON(pa & ~PAGE_MASK); + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + if (slab_is_available()) { + ret = ioremap_page_range(va, va + size, pa, prot); + if (ret) + unmap_kernel_range(va, size); + } else { + ret = early_ioremap_range(va, pa, size, prot); + } + + if (ret) + return NULL; + + return (void __iomem *)ea; +} +EXPORT_SYMBOL(__ioremap_at); + +/** + * Low level function to tear down the page tables for an IO mapping. This is + * used for mappings that are manipulated manually, like partial unmapping of + * PCI IOs or ISA space. 
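Both the 32-bit and 64-bit `__ioremap_caller()` start with the same bookkeeping: split the physical address into a page-aligned base plus an in-page offset, round the size up so the mapping covers whole pages, and hand the offset back to the caller on top of the mapped base (the 32-bit early path then carves its space downward from `IOREMAP_TOP` via `ioremap_bot -= size`). The rounding arithmetic on its own:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long addr = 0xfe001234UL;	/* requested MMIO address */
	unsigned long size = 0x20;		/* requested length */

	unsigned long p = addr & PAGE_MASK;		/* page-aligned base */
	unsigned long offset = addr & ~PAGE_MASK;	/* offset within page */
	unsigned long map_size = PAGE_ALIGN(addr + size) - p;

	/* maps 0xfe001000 for 0x1000 bytes; caller gets base + 0x234 */
	printf("base=%#lx size=%#lx offset=%#lx\n", p, map_size, offset);
	return 0;
}
```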
+ */ +void __iounmap_at(void *ea, unsigned long size) +{ + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + unmap_kernel_range((unsigned long)ea, size); +} +EXPORT_SYMBOL(__iounmap_at); + +void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) +{ + phys_addr_t paligned, offset; + void __iomem *ret; + int err; + + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + + /* + * Choose an address to map it to. Once the vmalloc system is running, + * we use it. Before that, we map using addresses going up from + * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE + * through ioremap_bot. + */ + paligned = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + size = PAGE_ALIGN(addr + size) - paligned; + + if (size == 0 || paligned == 0) + return NULL; + + if (slab_is_available()) + return do_ioremap(paligned, offset, size, prot, caller); + + pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); + + err = early_ioremap_range(ioremap_bot, paligned, size, prot); + if (err) + return NULL; + + ret = (void __iomem *)ioremap_bot + offset; + ioremap_bot += size; + + return ret; +} + +/* + * Unmap an IO region and remove it from vmalloc'd list. + * Access to IO memory should be serialized by driver. + */ +void iounmap(volatile void __iomem *token) +{ + void *addr; + + if (!slab_is_available()) + return; + + addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); + + if ((unsigned long)addr < ioremap_bot) { + pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); + return; + } + vunmap(addr); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile new file mode 100644 index 000000000000..6577897673dd --- /dev/null +++ b/arch/powerpc/mm/kasan/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE := n + +obj-$(CONFIG_PPC32) += kasan_init_32.o diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c new file mode 100644 index 000000000000..0e6ed4413eea --- /dev/null +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include <linux/kasan.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/moduleloader.h> +#include <linux/sched/task.h> +#include <linux/vmalloc.h> +#include <asm/pgalloc.h> +#include <asm/code-patching.h> +#include <mm/mmu_decl.h> + +static pgprot_t kasan_prot_ro(void) +{ + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return PAGE_READONLY; + + return PAGE_KERNEL_RO; +} + +static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) +{ + unsigned long va = (unsigned long)kasan_early_shadow_page; + phys_addr_t pa = __pa(kasan_early_shadow_page); + int i; + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); +} + +static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +{ + pmd_t *pmd; + unsigned long k_cur, k_next; + pgprot_t prot = slab_is_available() ? 
kasan_prot_ro() : PAGE_KERNEL; + + pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { + pte_t *new; + + k_next = pgd_addr_end(k_cur, k_end); + if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + if (slab_is_available()) + new = pte_alloc_one_kernel(&init_mm); + else + new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); + + if (!new) + return -ENOMEM; + kasan_populate_pte(new, prot); + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&init_mm.page_table_lock); + /* Has another populated it ? */ + if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) { + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } + spin_unlock(&init_mm.page_table_lock); + + if (new && slab_is_available()) + pte_free_kernel(&init_mm, new); + } + return 0; +} + +static void __ref *kasan_get_one_page(void) +{ + if (slab_is_available()) + return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + return memblock_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static int __ref kasan_init_region(void *start, size_t size) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); + unsigned long k_cur; + int ret; + void *block = NULL; + + ret = kasan_init_shadow_page_tables(k_start, k_end); + if (ret) + return ret; + + if (!slab_is_available()) + block = memblock_alloc(k_end - k_start, PAGE_SIZE); + + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + void *va = block ? block + k_cur - k_start : kasan_get_one_page(); + pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + + if (!va) + return -ENOMEM; + + __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); + } + flush_tlb_kernel_range(k_start, k_end); + return 0; +} + +static void __init kasan_remap_early_shadow_ro(void) +{ + pgprot_t prot = kasan_prot_ro(); + unsigned long k_start = KASAN_SHADOW_START; + unsigned long k_end = KASAN_SHADOW_END; + unsigned long k_cur; + phys_addr_t pa = __pa(kasan_early_shadow_page); + + kasan_populate_pte(kasan_early_shadow_pte, prot); + + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pte_t *ptep = pte_offset_kernel(pmd, k_cur); + + if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); + } + flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); +} + +void __init kasan_mmu_init(void) +{ + int ret; + struct memblock_region *reg; + + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + + for_each_memblock(memory, reg) { + phys_addr_t base = reg->base; + phys_addr_t top = min(base + reg->size, total_lowmem); + + if (base >= top) + continue; + + ret = kasan_init_region(__va(base), top - base); + if (ret) + panic("kasan: kasan_init_region() failed"); + } +} + +void __init kasan_init(void) +{ + kasan_remap_early_shadow_ro(); + + clear_page(kasan_early_shadow_page); + + /* At this point kasan is fully initialized. 
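`kasan_init_region()` sizes its allocations from `kasan_mem_to_shadow()`: generic KASAN keeps one shadow byte per 8 bytes of memory, so the shadow for a region is its address range scaled down by 8 and rebased at a fixed offset. The scaling, with an invented offset (the real one is a platform constant):

```c
#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT 3		/* 1 shadow byte per 8 bytes */
#define KASAN_SHADOW_OFFSET 0xe0000000UL	/* invented for the sketch */

static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	unsigned long start = 0xc0000000UL, size = 256UL << 20; /* 256M lowmem */
	unsigned long k_start = mem_to_shadow(start);
	unsigned long k_end = mem_to_shadow(start + size);

	/* 256M of memory needs 32M of shadow */
	printf("shadow %#lx..%#lx (%lu MiB)\n", k_start, k_end,
	       (k_end - k_start) >> 20);
	return 0;
}
```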
Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KASAN init done\n"); +} + +#ifdef CONFIG_MODULES +void *module_alloc(unsigned long size) +{ + void *base; + + base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); + + if (!base) + return NULL; + + if (!kasan_init_region(base, size)) + return base; + + vfree(base); + + return NULL; +} +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 +u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; + +static void __init kasan_early_hash_table(void) +{ + modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16); + + Hash = (struct hash_pte *)early_hash; +} +#else +static void __init kasan_early_hash_table(void) {} +#endif + +void __init kasan_early_init(void) +{ + unsigned long addr = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + unsigned long next; + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr); + + BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK); + + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL); + + do { + next = pgd_addr_end(addr, end); + pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte); + } while (pmd++, addr = next, addr != end); + + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + kasan_early_hash_table(); +} diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index f6787f90e158..ad299e72ec30 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -9,12 +10,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/export.h> @@ -36,6 +31,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/memremap.h> +#include <linux/dma-direct.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -54,7 +50,7 @@ #include <asm/swiotlb.h> #include <asm/rtas.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifndef CPU_FTR_COHERENT_ICACHE #define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ @@ -109,8 +105,29 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +#define FLUSH_CHUNK_SIZE SZ_1G +/** + * flush_dcache_range_chunked(): Write any modified data cache blocks out to + * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE + * Does not invalidate the corresponding instruction cache blocks. 
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ * @chunk: the max size of the chunks
+ */
+static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
+ unsigned long chunk)
+{
+ unsigned long i;
+
+ for (i = start; i < stop; i += chunk) {
+ flush_dcache_range(i, min(stop, i + chunk));
+ cond_resched();
+ }
+}
+
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
 {
 unsigned long start_pfn = start >> PAGE_SHIFT;
 unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -125,48 +142,36 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
 start, start + size, rc);
 return -EFAULT;
 }
- flush_inval_dcache_range(start, start + size);
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
-int __meminit arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
 {
 unsigned long start_pfn = start >> PAGE_SHIFT;
 unsigned long nr_pages = size >> PAGE_SHIFT;
- struct page *page;
+ struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
 int ret;
- /*
- * If we have an altmap then we need to skip over any reserved PFNs
- * when querying the zone.
- */
- page = pfn_to_page(start_pfn);
- if (altmap)
- page += vmem_altmap_offset(altmap);
-
- ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
- if (ret)
- return ret;
+ __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
 /* Remove htab bolted mappings for this section of memory */
 start = (unsigned long)__va(start);
- flush_inval_dcache_range(start, start + size);
+ flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
+
 ret = remove_section_mapping(start, start + size);
+ WARN_ON_ONCE(ret);
 /* Ensure all vmalloc mappings are flushed in case they also
 * hit that section of memory
 */
 vm_unmap_aliases();
- resize_hpt_for_hotplug(memblock_phys_mem_size());
-
- return ret;
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
 }
 #endif
-#endif /* CONFIG_MEMORY_HOTPLUG */
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 void __init mem_topology_setup(void)
@@ -218,10 +223,10 @@ static int __init mark_nonram_nosave(void)
 * everything else. GFP_DMA32 page allocations automatically fall back to
 * ZONE_DMA.
 *
- * By using 31-bit unconditionally, we can exploit ARCH_ZONE_DMA_BITS to
- * inform the generic DMA mapping code. 32-bit only devices (if not handled
- * by an IOMMU anyway) will take a first dip into ZONE_NORMAL and get
- * otherwise served by ZONE_DMA.
+ * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
+ * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
+ * ZONE_DMA.
 */
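
To sanity-check that arithmetic: paging_init() below picks zone_dma_bits of 30 on PPC32 (for the Broadcom wifi quirk) or 31 otherwise, and max_zone_pfns[ZONE_DMA] becomes 1 << (zone_dma_bits - PAGE_SHIFT) page frames. A user-space sketch, assuming 4K pages:

    #include <stdio.h>

    int main(void)
    {
            const unsigned int page_shift = 12; /* assume 4K pages */

            /* 30 bits on PPC32 (Broadcom wifi quirk), 31 bits elsewhere. */
            for (unsigned int bits = 30; bits <= 31; bits++) {
                    unsigned long pfns = 1UL << (bits - page_shift);

                    printf("zone_dma_bits=%u -> ZONE_DMA top: %lu pages (%lu MB)\n",
                           bits, pfns, (pfns << page_shift) >> 20);
            }
            return 0; /* prints 1024 MB and 2048 MB respectively */
    }
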
 static unsigned long max_zone_pfns[MAX_NR_ZONES];
 
@@ -233,15 +238,13 @@ void __init paging_init(void)
 unsigned long long total_ram = memblock_phys_mem_size();
 phys_addr_t top_of_ram = memblock_end_of_DRAM();
 
-#ifdef CONFIG_PPC32
- unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
- unsigned long end = __fix_to_virt(FIX_HOLE);
+#ifdef CONFIG_HIGHMEM
+ unsigned long v = __fix_to_virt(FIX_KMAP_END);
+ unsigned long end = __fix_to_virt(FIX_KMAP_BEGIN);
 
 for (; v < end; v += PAGE_SIZE)
 map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */
-#endif
-
-#ifdef CONFIG_HIGHMEM
 map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */
 pkmap_page_table = virt_to_kpte(PKMAP_BASE);
 
@@ -254,8 +257,18 @@ void __init paging_init(void)
 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 (long int)((top_of_ram - total_ram) >> 20));
 
+ /*
+ * Allow 30-bit DMA for very limited Broadcom wifi chips on many
+ * powerbooks.
+ */
+ if (IS_ENABLED(CONFIG_PPC32))
+ zone_dma_bits = 30;
+ else
+ zone_dma_bits = 31;
+
 #ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = min(max_low_pfn, 0x7fffffffUL >> PAGE_SHIFT);
+ max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
+ 1UL << (zone_dma_bits - PAGE_SHIFT));
 #endif
 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
@@ -309,17 +322,18 @@ void __init mem_init(void)
 mem_init_print_info(NULL);
 #ifdef CONFIG_PPC32
 pr_info("Kernel virtual memory layout:\n");
+#ifdef CONFIG_KASAN
+ pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n",
+ KASAN_SHADOW_START, KASAN_SHADOW_END);
+#endif
 pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
 #ifdef CONFIG_HIGHMEM
 pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
 #endif /* CONFIG_HIGHMEM */
-#ifdef CONFIG_NOT_COHERENT_CACHE
- pr_info(" * 0x%08lx..0x%08lx : consistent mem\n",
- IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
-#endif /* CONFIG_NOT_COHERENT_CACHE */
- pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
- ioremap_bot, IOREMAP_TOP);
+ if (ioremap_bot != IOREMAP_TOP)
+ pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
+ ioremap_bot, IOREMAP_TOP);
 pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
 VMALLOC_START, VMALLOC_END);
 #endif /* CONFIG_PPC32 */
@@ -333,12 +347,119 @@ void free_initmem(void)
 free_initmem_default(POISON_FREE_INITMEM);
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
+/**
+ * flush_coherent_icache() - if a CPU has a coherent icache, flush it
+ * @addr: The base address to use (can be any valid address, the whole cache will be flushed)
+ * Return true if the cache was flushed, false otherwise
+ */
+static inline bool flush_coherent_icache(unsigned long addr)
 {
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
+ /*
+ * For a snooping icache, we still need a dummy icbi to purge all the
+ * prefetched instructions from the ifetch buffers. We also need a sync
+ * before the icbi to order the actual stores to memory that might
+ * have modified instructions with the icbi.
+ */
+ if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+ mb(); /* sync */
+ icbi((void *)addr);
+ mb(); /* sync */
+ isync();
+ return true;
+ }
+
+ return false;
 }
-#endif
+
+/**
+ * invalidate_icache_range() - Flush the icache by issuing icbi across an address range
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ */
+static void invalidate_icache_range(unsigned long start, unsigned long stop)
+{
+ unsigned long shift = l1_icache_shift();
+ unsigned long bytes = l1_icache_bytes();
+ char *addr = (char *)(start & ~(bytes - 1));
+ unsigned long size = stop - (unsigned long)addr + (bytes - 1);
+ unsigned long i;
+
+ for (i = 0; i < size >> shift; i++, addr += bytes)
+ icbi(addr);
+
+ mb(); /* sync */
+ isync();
+}
+
+/**
+ * flush_icache_range: Write any modified data cache blocks out to memory
+ * and invalidate the corresponding blocks in the instruction cache
+ *
+ * Generic code will call this after writing memory, before executing from it.
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ */
+void flush_icache_range(unsigned long start, unsigned long stop)
+{
+ if (flush_coherent_icache(start))
+ return;
+
+ clean_dcache_range(start, stop);
+
+ if (IS_ENABLED(CONFIG_44x)) {
+ /*
+ * Flash invalidate on 44x because we are passed kmapped
+ * addresses and this doesn't work for userspace pages due to
+ * the virtually tagged icache.
+ */
+ iccci((void *)start);
+ mb(); /* sync */
+ isync();
+ } else
+ invalidate_icache_range(start, stop);
+}
+EXPORT_SYMBOL(flush_icache_range);
+
+#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64)
+/**
+ * flush_dcache_icache_phys() - Flush a page by its physical address
+ * @physaddr: the physical address of the page
+ */
+static void flush_dcache_icache_phys(unsigned long physaddr)
+{
+ unsigned long bytes = l1_dcache_bytes();
+ unsigned long nb = PAGE_SIZE / bytes;
+ unsigned long addr = physaddr & PAGE_MASK;
+ unsigned long msr, msr0;
+ unsigned long loop1 = addr, loop2 = addr;
+
+ msr0 = mfmsr();
+ msr = msr0 & ~MSR_DR;
+ /*
+ * This must remain as ASM to prevent potential memory accesses
+ * while the data MMU is disabled
+ */
+ asm volatile(
+ " mtctr %2;\n"
+ " mtmsr %3;\n"
+ " isync;\n"
+ "0: dcbst 0, %0;\n"
+ " addi %0, %0, %4;\n"
+ " bdnz 0b;\n"
+ " sync;\n"
+ " mtctr %2;\n"
+ "1: icbi 0, %1;\n"
+ " addi %1, %1, %4;\n"
+ " bdnz 1b;\n"
+ " sync;\n"
+ " mtmsr %5;\n"
+ " isync;\n"
+ : "+&r" (loop1), "+&r" (loop2)
+ : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
+ : "ctr", "memory");
+}
+#endif // !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64)
 
 /*
 * This is called when a page has been modified by the kernel.
@@ -372,12 +493,46 @@ void flush_dcache_icache_page(struct page *page)
 __flush_dcache_icache(start);
 kunmap_atomic(start);
 } else {
- __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
+ unsigned long addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ if (flush_coherent_icache(addr))
+ return;
+ flush_dcache_icache_phys(addr);
 }
 #endif
 }
 EXPORT_SYMBOL(flush_dcache_icache_page);
 
+/**
+ * __flush_dcache_icache(): Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ *
+ * @p: the address of the page to flush
+ */
+void __flush_dcache_icache(void *p)
+{
+ unsigned long addr = (unsigned long)p;
+
+ if (flush_coherent_icache(addr))
+ return;
+
+ clean_dcache_range(addr, addr + PAGE_SIZE);
+
+ /*
+ * We don't flush the icache on 44x.
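
The contract of the new flush_icache_range() is the usual one for self-modifying code: store the instructions, then flush before executing them. A hypothetical kernel-side sketch (patch_one_insn() is a made-up helper, and patch_addr is assumed to be a writable mapping of the text being modified):

    #include <linux/types.h>
    #include <asm/cacheflush.h>

    /* Hypothetical helper: write one instruction and make it fetchable. */
    static void patch_one_insn(u32 *patch_addr, u32 insn)
    {
            *patch_addr = insn;     /* the store lands in the dcache */

            /* push it to memory and invalidate the stale icache block */
            flush_icache_range((unsigned long)patch_addr,
                               (unsigned long)patch_addr + sizeof(insn));
    }
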
Those have a virtual icache and we + * don't have access to the virtual address here (it's not the page + * vaddr but where it's mapped in user space). The flushing of the + * icache on these is handled elsewhere, when a change in the address + * space occurs, before returning to user space. + */ + + if (cpu_has_feature(MMU_FTR_TYPE_44x)) + return; + + invalidate_icache_range(addr, addr + PAGE_SIZE); +} + void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); @@ -426,63 +581,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, EXPORT_SYMBOL(flush_icache_user_range); /* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux PTE. - * - * This must always be called with the pte lock held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep) -{ -#ifdef CONFIG_PPC_BOOK3S - /* - * We don't need to worry about _PAGE_PRESENT here because we are - * called with either mm->page_table_lock held or ptl lock held - */ - unsigned long trap; - bool is_exec; - - if (radix_enabled()) { - prefetch((void *)address); - return; - } - - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ - if (!pte_young(*ptep) || address >= TASK_SIZE) - return; - - /* We try to figure out if we are coming from an instruction - * access fault and pass that down to __hash_page so we avoid - * double-faulting on execution of fresh text. We have to test - * for regs NULL since init will get here first thing at boot - * - * We also avoid filling the hash if not coming from a fault - */ - - trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; - switch (trap) { - case 0x300: - is_exec = false; - break; - case 0x400: - is_exec = true; - break; - default: - return; - } - - hash_preload(vma->vm_mm, address, is_exec, trap); -#endif /* CONFIG_PPC_BOOK3S */ -#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ - && defined(CONFIG_HUGETLB_PAGE) - if (is_vm_hugetlb_page(vma)) - book3e_hugetlb_preload(vma, address, *ptep); -#endif -} - -/* * System memory should not be in /proc/iomem but various tools expect it * (eg kdump). */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index b24ce40acd47..ae683fdc716c 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -1,24 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * flexible mmap layout support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * * Started by Ingo Molnar <mingo@elte.hu> */ diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index bb52320b7369..18f20da0d348 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -1,13 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Common implementation of switch_mm_irqs_off * * Copyright IBM Corp. 2017 - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/mm.h> @@ -98,7 +93,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_mmu_context(prev, next, tsk); } -#ifdef CONFIG_PPC32 +#ifndef CONFIG_PPC_BOOK3S_64 void arch_exit_mmap(struct mm_struct *mm) { void *frag = pte_frag_get(&mm->context); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 74ff61dabcb1..8e99649c24fc 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Declarations of procedures and variables shared between files * in arch/ppc/mm/. @@ -11,12 +12,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * */ #include <linux/mm.h> #include <asm/mmu.h> @@ -83,19 +78,21 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, } #endif -#else /* CONFIG_PPC_MMU_NOHASH */ - -extern void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap); +static inline void print_system_hash_info(void) {} +#else /* CONFIG_PPC_MMU_NOHASH */ extern void _tlbie(unsigned long address); extern void _tlbia(void); +void print_system_hash_info(void); + #endif /* CONFIG_PPC_MMU_NOHASH */ #ifdef CONFIG_PPC32 +void hash_preload(struct mm_struct *mm, unsigned long ea); + extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); @@ -104,12 +101,11 @@ extern int __map_without_bats; extern unsigned int rtas_data, rtas_size; struct hash_pte; -extern struct hash_pte *Hash, *Hash_end; -extern unsigned long Hash_size, Hash_mask; +extern struct hash_pte *Hash; +extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; @@ -130,6 +126,7 @@ extern void wii_memory_fixups(void); */ #ifdef CONFIG_PPC32 extern void MMU_init_hw(void); +void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif @@ -142,10 +139,21 @@ extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, extern void adjust_total_lowmem(void); extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); +void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); +void reloc_kernel_entry(void *fdt, int addr); +extern int is_second_reloc; #endif extern void loadcam_entry(unsigned int index); extern void loadcam_multi(int first_idx, int num, int tmp_idx); +#ifdef CONFIG_RANDOMIZE_BASE +void kaslr_early_init(void *dt_ptr, phys_addr_t size); +void kaslr_late_init(void); +#else +static inline void kaslr_early_init(void *dt_ptr, phys_addr_t size) {} +static inline void kaslr_late_init(void) {} +#endif + struct tlbcam { u32 MAS0; u32 MAS1; diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/nohash/40x.c index b9cf6f8764b0..f348104eb461 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/nohash/40x.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for initializing the MMU * on the 4xx series of chips. @@ -12,12 +13,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/signal.h> @@ -49,7 +44,7 @@ #include <asm/machdep.h> #include <asm/setup.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> extern int __map_without_ltlbs; /* diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/nohash/44x.c index aad127acdbaa..3d6ae7c72412 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Modifications by Matt Porter (mporter@mvista.com) to support * PPC44x Book E processors. 
@@ -15,12 +16,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/init.h> @@ -31,7 +26,7 @@ #include <asm/cacheflush.h> #include <asm/code-patching.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* Used by the 44x TLB replacement exception handler. * Just needed it declared someplace. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/nohash/8xx.c index fe1f6443d57f..090af2d2d3e4 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -1,15 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines for initializing the MMU * on the 8xx series of chips. * -- christophe * * Derived from arch/powerpc/mm/40x_mmu.c: - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/memblock.h> @@ -17,7 +12,7 @@ #include <asm/fixmap.h> #include <asm/code-patching.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) @@ -108,6 +103,19 @@ static void mmu_patch_addis(s32 *site, long simm) patch_instruction_site(site, instr); } +void __init mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, pgprot_t prot) +{ + unsigned long s = offset; + unsigned long v = PAGE_OFFSET + s; + phys_addr_t p = memstart_addr + s; + + for (; s < top; s += PAGE_SIZE) { + map_kernel_page(v, p, prot); + v += PAGE_SIZE; + p += PAGE_SIZE; + } +} + unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long mapped; @@ -120,10 +128,20 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); } else { + unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, - _ALIGN(__pa(_einittext), 8 << 20)); + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, einittext8); + + /* + * Populate page tables to: + * - have them appear in /sys/kernel/debug/kernel_page_tables + * - allow the BDI to find the pages when they are not PINNED + */ + mmu_mapin_ram_chunk(0, einittext8, PAGE_KERNEL_X); + mmu_mapin_ram_chunk(einittext8, mapped, PAGE_KERNEL); + mmu_mapin_immr(); } mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); @@ -149,18 +167,41 @@ void mmu_mark_initmem_nx(void) if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) mmu_patch_addis(&patch__itlbmiss_linmem_top8, -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) { + unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); + unsigned long etext = __pa(_etext); + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); + + /* Update page tables for PTDUMP and BDI */ + mmu_mapin_ram_chunk(0, einittext8, __pgprot(0)); + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { + mmu_mapin_ram_chunk(0, etext, PAGE_KERNEL_TEXT); + 
mmu_mapin_ram_chunk(etext, einittext8, PAGE_KERNEL); + } else { + mmu_mapin_ram_chunk(0, etext8, PAGE_KERNEL_TEXT); + mmu_mapin_ram_chunk(etext8, einittext8, PAGE_KERNEL); + } + } } #ifdef CONFIG_STRICT_KERNEL_RWX void mmu_mark_rodata_ro(void) { + unsigned long sinittext = __pa(_sinittext); + unsigned long etext = __pa(_etext); + if (CONFIG_DATA_SHIFT < 23) mmu_patch_addis(&patch__dtlbmiss_romem_top8, -__pa(((unsigned long)_sinittext) & ~(LARGE_PAGE_SIZE_8M - 1))); mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); + + /* Update page tables for PTDUMP and BDI */ + mmu_mapin_ram_chunk(0, sinittext, __pgprot(0)); + mmu_mapin_ram_chunk(0, etext, PAGE_KERNEL_ROX); + mmu_mapin_ram_chunk(etext, sinittext, PAGE_KERNEL_RO); } #endif @@ -213,3 +254,27 @@ void flush_instruction_cache(void) mtspr(SPRN_IC_CST, IDC_INVALL); isync(); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile new file mode 100644 index 000000000000..0424f6ce5bd8 --- /dev/null +++ b/arch/powerpc/mm/nohash/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +obj-y += mmu_context.o tlb.o tlb_low.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_40x) += 40x.o +obj-$(CONFIG_44x) += 44x.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o +endif + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_fsl_booke.o := n diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index f84ec46cdb26..8b88be91b622 100644 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -11,8 +11,9 @@ #include <asm/mmu.h> -#ifdef CONFIG_PPC_FSL_BOOK3E #ifdef CONFIG_PPC64 +#include <asm/paca.h> + static inline int tlb1_next(void) { struct paca_struct *paca = get_paca(); @@ -29,33 +30,6 @@ static inline int tlb1_next(void) tcd->esel_next = next; return this; } -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} -#endif /* !PPC64 */ -#endif /* FSL */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} - -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) -#include <asm/paca.h> static inline void book3e_tlb_lock(void) { @@ -98,6 +72,23 @@ static inline void book3e_tlb_unlock(void) paca->tcd_ptr->lock = 0; } #else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + 
+ /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + static inline void book3e_tlb_lock(void) { } @@ -131,18 +122,15 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) return found; } -void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, - pte_t pte) +static void +book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) { unsigned long mas1, mas2; u64 mas7_3; unsigned long psize, tsize, shift; unsigned long flags; struct mm_struct *mm; - -#ifdef CONFIG_PPC_FSL_BOOK3E int index; -#endif if (unlikely(is_kernel_addr(ea))) return; @@ -166,11 +154,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, return; } -#ifdef CONFIG_PPC_FSL_BOOK3E /* We have to use the CAM(TLB1) on FSL parts for hugepages */ index = tlb1_next(); mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); -#endif mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); mas2 = ea & ~((1UL << shift) - 1); @@ -197,6 +183,18 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +} + void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { struct hstate *hstate = hstate_file(vma->vm_file); diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 1032ef7aaf62..4637fdd469cf 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2005, Paul Mackerras, IBM Corporation. * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
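
The 32-bit tlb1_next() kept in the #else branch above simply round-robins through the CAM slots left over after the bolted linear-map entries. The rotation in isolation, with made-up entry counts (the real ones come from tlbcam_index and SPRN_TLB1CFG):

    #include <stdio.h>

    #define TLBCAM_INDEX 4  /* hypothetical: first 4 entries are bolted */
    #define NCAMS        16 /* hypothetical TLB1 size */

    static int next_idx = TLBCAM_INDEX;

    /* Round-robin over entries TLBCAM_INDEX..NCAMS-1, wrapping at the end. */
    static int tlb1_next(void)
    {
            int index = next_idx;

            next_idx = (index == NCAMS - 1) ? TLBCAM_INDEX : index + 1;
            return index;
    }

    int main(void)
    {
            for (int i = 0; i < 14; i++)
                    printf("%d ", tlb1_next());
            printf("\n"); /* 4 5 6 ... 15 4 5 */
            return 0;
    }
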
*/ #include <linux/sched.h> @@ -15,7 +11,7 @@ #include <asm/tlb.h> #include <asm/dma.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifdef CONFIG_SPARSEMEM_VMEMMAP /* @@ -55,7 +51,7 @@ void vmemmap_remove_mapping(unsigned long start, #endif #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static __ref void *early_alloc_pgtable(unsigned long size) +static void __init *early_alloc_pgtable(unsigned long size) { void *ptr; @@ -74,7 +70,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -98,20 +94,17 @@ int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) #ifndef __PAGETABLE_PUD_FOLDED if (pgd_none(*pgdp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); pgd_populate(&init_mm, pgdp, pudp); } #endif /* !__PAGETABLE_PUD_FOLDED */ pudp = pud_offset(pgdp, ea); if (pud_none(*pudp)) { pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (!pmd_present(*pmdp)) { ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/nohash/fsl_booke.c index 210cbc1faf63..b4eb06ceb189 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Modifications by Kumar Gala (galak@kernel.crashing.org) to support * E500 Book E processors. @@ -17,12 +18,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/signal.h> @@ -54,7 +49,7 @@ #include <asm/setup.h> #include <asm/paca.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> unsigned int tlbcam_index; @@ -268,11 +263,13 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, int __initdata is_second_reloc; notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) { - unsigned long base = KERNELBASE; + unsigned long base = kernstart_virt_addr; + phys_addr_t size; kernstart_addr = start; if (is_second_reloc) { virt_phys_offset = PAGE_OFFSET - memstart_addr; + kaslr_late_init(); return; } @@ -296,7 +293,7 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) start &= ~0x3ffffff; base &= ~0x3ffffff; virt_phys_offset = base - start; - early_get_first_memblock_info(__va(dt_ptr), NULL); + early_get_first_memblock_info(__va(dt_ptr), &size); /* * We now get the memstart_addr, then we should check if this * address is the same as what the PAGE_OFFSET map to now. 
If
@@ -321,6 +318,8 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
 /* We should never reach here */
 panic("Relocation error");
 }
+
+ kaslr_early_init(__va(dt_ptr), size);
 }
 #endif
 #endif
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
new file mode 100644
index 000000000000..4a75f2d9bf0e
--- /dev/null
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright (C) 2019 Jason Yan <yanaijie@huawei.com>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+#include <linux/crash_core.h>
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/kdump.h>
+#include <mm/mmu_decl.h>
+#include <generated/compile.h>
+#include <generated/utsrelease.h>
+
+struct regions {
+ unsigned long pa_start;
+ unsigned long pa_end;
+ unsigned long kernel_size;
+ unsigned long dtb_start;
+ unsigned long dtb_end;
+ unsigned long initrd_start;
+ unsigned long initrd_end;
+ unsigned long crash_start;
+ unsigned long crash_end;
+ int reserved_mem;
+ int reserved_mem_addr_cells;
+ int reserved_mem_size_cells;
+};
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+struct regions __initdata regions;
+
+static __init void kaslr_get_cmdline(void *fdt)
+{
+ int node = fdt_path_offset(fdt, "/chosen");
+
+ early_init_dt_scan_chosen(node, "chosen", 1, boot_command_line);
+}
+
+static unsigned long __init rotate_xor(unsigned long hash, const void *area,
+ size_t size)
+{
+ size_t i;
+ const unsigned long *ptr = area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+/* Attempt to create a simple starting entropy. This can make it different for
+ * every build, but it is still not enough. Stronger entropy should
+ * be added to make it change for every boot.
+ */ +static unsigned long __init get_boot_seed(void *fdt) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, fdt, fdt_totalsize(fdt)); + + return hash; +} + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init bool regions_overlap(u32 s1, u32 e1, u32 s2, u32 e2) +{ + return e1 >= s2 && e2 >= s1; +} + +static __init bool overlaps_reserved_region(const void *fdt, u32 start, + u32 end) +{ + int subnode, len, i; + u64 base, size; + + /* check for overlap with /memreserve/ entries */ + for (i = 0; i < fdt_num_mem_rsv(fdt); i++) { + if (fdt_get_mem_rsv(fdt, i, &base, &size) < 0) + continue; + if (regions_overlap(start, end, base, base + size)) + return true; + } + + if (regions.reserved_mem < 0) + return false; + + /* check for overlap with static reservations in /reserved-memory */ + for (subnode = fdt_first_subnode(fdt, regions.reserved_mem); + subnode >= 0; + subnode = fdt_next_subnode(fdt, subnode)) { + const fdt32_t *reg; + u64 rsv_end; + + len = 0; + reg = fdt_getprop(fdt, subnode, "reg", &len); + while (len >= (regions.reserved_mem_addr_cells + + regions.reserved_mem_size_cells)) { + base = fdt32_to_cpu(reg[0]); + if (regions.reserved_mem_addr_cells == 2) + base = (base << 32) | fdt32_to_cpu(reg[1]); + + reg += regions.reserved_mem_addr_cells; + len -= 4 * regions.reserved_mem_addr_cells; + + size = fdt32_to_cpu(reg[0]); + if (regions.reserved_mem_size_cells == 2) + size = (size << 32) | fdt32_to_cpu(reg[1]); + + reg += regions.reserved_mem_size_cells; + len -= 4 * regions.reserved_mem_size_cells; + + if (base >= regions.pa_end) + continue; + + rsv_end = min(base + size, (u64)U32_MAX); + + if (regions_overlap(start, end, base, rsv_end)) + return true; + } + } + return false; +} + +static __init bool overlaps_region(const void *fdt, u32 start, + u32 end) +{ + if (regions_overlap(start, end, __pa(_stext), __pa(_end))) + return true; + + if (regions_overlap(start, end, regions.dtb_start, + regions.dtb_end)) + return true; + + if (regions_overlap(start, end, regions.initrd_start, + regions.initrd_end)) + return true; + + if (regions_overlap(start, end, regions.crash_start, + regions.crash_end)) + return true; + + return overlaps_reserved_region(fdt, start, end); +} + +static void __init get_crash_kernel(void *fdt, unsigned long size) +{ +#ifdef CONFIG_CRASH_CORE + unsigned long long crash_size, crash_base; + int ret; + + ret = parse_crashkernel(boot_command_line, size, &crash_size, + &crash_base); + if (ret != 0 || crash_size == 0) + return; + if (crash_base == 0) + crash_base = KDUMP_KERNELBASE; + + regions.crash_start = (unsigned long)crash_base; + regions.crash_end = (unsigned long)(crash_base + crash_size); + + pr_debug("crash_base=0x%llx crash_size=0x%llx\n", crash_base, crash_size); +#endif +} + +static void __init get_initrd_range(void *fdt) +{ + u64 start, end; + int node, len; + const __be32 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return; + + prop = fdt_getprop(fdt, node, "linux,initrd-start", &len); + if (!prop) + return; + start = of_read_number(prop, len / 4); + + prop = fdt_getprop(fdt, node, "linux,initrd-end", &len); + if (!prop) + return; + end = of_read_number(prop, len / 4); + + 
 regions.initrd_start = (unsigned long)start;
+ regions.initrd_end = (unsigned long)end;
+
+ pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", start, end);
+}
+
+static __init unsigned long get_usable_address(const void *fdt,
+ unsigned long start,
+ unsigned long offset)
+{
+ unsigned long pa;
+ unsigned long pa_end;
+
+ for (pa = offset; (long)pa > (long)start; pa -= SZ_16K) {
+ pa_end = pa + regions.kernel_size;
+ if (overlaps_region(fdt, pa, pa_end))
+ continue;
+
+ return pa;
+ }
+ return 0;
+}
+
+static __init void get_cell_sizes(const void *fdt, int node, int *addr_cells,
+ int *size_cells)
+{
+ const int *prop;
+ int len;
+
+ /*
+ * Retrieve the #address-cells and #size-cells properties
+ * from the 'node', or use the default if not provided.
+ */
+ *addr_cells = *size_cells = 1;
+
+ prop = fdt_getprop(fdt, node, "#address-cells", &len);
+ if (len == 4)
+ *addr_cells = fdt32_to_cpu(*prop);
+ prop = fdt_getprop(fdt, node, "#size-cells", &len);
+ if (len == 4)
+ *size_cells = fdt32_to_cpu(*prop);
+}
+
+static unsigned long __init kaslr_legal_offset(void *dt_ptr, unsigned long index,
+ unsigned long offset)
+{
+ unsigned long koffset = 0;
+ unsigned long start;
+
+ while ((long)index >= 0) {
+ offset = memstart_addr + index * SZ_64M + offset;
+ start = memstart_addr + index * SZ_64M;
+ koffset = get_usable_address(dt_ptr, start, offset);
+ if (koffset)
+ break;
+ index--;
+ }
+
+ if (koffset != 0)
+ koffset -= memstart_addr;
+
+ return koffset;
+}
+
+static inline __init bool kaslr_disabled(void)
+{
+ return strstr(boot_command_line, "nokaslr") != NULL;
+}
+
+static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size,
+ unsigned long kernel_sz)
+{
+ unsigned long offset, random;
+ unsigned long ram, linear_sz;
+ u64 seed;
+ unsigned long index;
+
+ kaslr_get_cmdline(dt_ptr);
+ if (kaslr_disabled())
+ return 0;
+
+ random = get_boot_seed(dt_ptr);
+
+ seed = get_tb() << 32;
+ seed ^= get_tb();
+ random = rotate_xor(random, &seed, sizeof(seed));
+
+ /*
+ * Retrieve (and wipe) the seed from the FDT
+ */
+ seed = get_kaslr_seed(dt_ptr);
+ if (seed)
+ random = rotate_xor(random, &seed, sizeof(seed));
+ else
+ pr_warn("KASLR: No safe seed for randomizing the kernel base.\n");
+
+ ram = min_t(phys_addr_t, __max_low_memory, size);
+ ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true);
+ linear_sz = min_t(unsigned long, ram, SZ_512M);
+
+ /* If the linear size is smaller than 64M, do not randomize */
+ if (linear_sz < SZ_64M)
+ return 0;
+
+ /* check for a reserved-memory node and record its cell sizes */
+ regions.reserved_mem = fdt_path_offset(dt_ptr, "/reserved-memory");
+ if (regions.reserved_mem >= 0)
+ get_cell_sizes(dt_ptr, regions.reserved_mem,
+ &regions.reserved_mem_addr_cells,
+ &regions.reserved_mem_size_cells);
+
+ regions.pa_start = memstart_addr;
+ regions.pa_end = memstart_addr + linear_sz;
+ regions.dtb_start = __pa(dt_ptr);
+ regions.dtb_end = __pa(dt_ptr) + fdt_totalsize(dt_ptr);
+ regions.kernel_size = kernel_sz;
+
+ get_initrd_range(dt_ptr);
+ get_crash_kernel(dt_ptr, ram);
+
+ /*
+ * Decide which 64M region we want to start in.
+ * Only use the low 8 bits of the random seed.
+ */
+ index = random & 0xFF;
+ index %= linear_sz / SZ_64M;
+
+ /* Decide the offset inside the 64M region */
+ offset = random % (SZ_64M - kernel_sz);
+ offset = round_down(offset, SZ_16K);
+
+ return kaslr_legal_offset(dt_ptr, index, offset);
+}
+
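
kaslr_choose_location() above boils the randomness down to two choices: which 64M-aligned slice of the linear mapping to use (low 8 bits of the seed, modulo the slice count) and a 16K-aligned offset inside that slice, with the seed mixed from build-string and timebase entropy via rotate_xor(). A user-space sketch of the arithmetic, using made-up sizes and seed material:

    #include <stdio.h>
    #include <string.h>

    #define SZ_16K 0x4000UL
    #define SZ_64M 0x04000000UL

    /* Same mixing step as the kernel's rotate_xor(). */
    static unsigned long rotate_xor(unsigned long hash, const void *area,
                                    size_t size)
    {
            const unsigned long *ptr = area;

            for (size_t i = 0; i < size / sizeof(hash); i++) {
                    hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
                    hash ^= ptr[i];
            }
            return hash;
    }

    int main(void)
    {
            static const char build[] = "5.x.y (user@host) (gcc) #1"; /* fake */
            unsigned long random = rotate_xor(0, build, sizeof(build));
            unsigned long linear_sz = 8 * SZ_64M;   /* 512M linear mapping */
            unsigned long kernel_sz = 0x01400000UL; /* 20M kernel image    */
            unsigned long index, offset;

            /* Pick the 64M slice from the low 8 bits of the seed. */
            index = (random & 0xFF) % (linear_sz / SZ_64M);

            /* Pick a 16K-aligned offset that keeps the kernel inside it. */
            offset = (random % (SZ_64M - kernel_sz)) & ~(SZ_16K - 1);

            printf("candidate: slice %lu, offset %#lx\n", index, offset);
            return 0;
    }
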
+/*
+ * Decide if we need to relocate the kernel to a random offset.
+ * void *dt_ptr - address of the device tree
+ * phys_addr_t size - size of the first memory block
+ */
+notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size)
+{
+ unsigned long tlb_virt;
+ phys_addr_t tlb_phys;
+ unsigned long offset;
+ unsigned long kernel_sz;
+
+ kernel_sz = (unsigned long)_end - (unsigned long)_stext;
+
+ offset = kaslr_choose_location(dt_ptr, size, kernel_sz);
+ if (offset == 0)
+ return;
+
+ kernstart_virt_addr += offset;
+ kernstart_addr += offset;
+
+ is_second_reloc = 1;
+
+ if (offset >= SZ_64M) {
+ tlb_virt = round_down(kernstart_virt_addr, SZ_64M);
+ tlb_phys = round_down(kernstart_addr, SZ_64M);
+
+ /* Create kernel map to relocate in */
+ create_kaslr_tlb_entry(1, tlb_virt, tlb_phys);
+ }
+
+ /* Copy the kernel to its new location and run */
+ memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz);
+ flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz);
+
+ reloc_kernel_entry(dt_ptr, kernstart_virt_addr);
+}
+
+void __init kaslr_late_init(void)
+{
+ /* If randomized, clear the original kernel */
+ if (kernstart_virt_addr != KERNELBASE) {
+ unsigned long kernel_sz;
+
+ kernel_sz = (unsigned long)_end - kernstart_virt_addr;
+ memzero_explicit((void *)KERNELBASE, kernel_sz);
+ }
+}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/nohash/mmu_context.c
index 1945c5f19f5e..aac81c9f84a5 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/nohash/mmu_context.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
 * This file contains the routines for handling the MMU on those
 * PowerPC implementations where the MMU is not using the hash
@@ -9,11 +10,6 @@
 * Derived from previous arch/powerpc/mm/mmu_context.c
 * and arch/powerpc/include/asm/mmu_context.h
 *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
 * TODO:
 *
 *   - The global context lock will not scale very well
@@ -52,7 +48,7 @@
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
 
 /*
 * The MPC8xx has only 16 contexts. We rotate through them on each task switch.
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/nohash/tlb.c
index ac23dc1c6535..696f568253a0 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
 * This file contains the routines for TLB flushing.
 * On machines where the MMU does not use a hash table to store virtual to
@@ -19,12 +20,6 @@
 *
 * Derived from "arch/i386/mm/init.c"
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
 */
 
 #include <linux/kernel.h>
@@ -46,7 +41,7 @@
 #include <asm/hugetlb.h>
 #include <asm/paca.h>
 
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
 /*
 * This struct lists the sw-supported page sizes. The hardware MMU may support
@@ -433,11 +428,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 unsigned long rid = (address & rmask) | 0x1000000000000000ul;
 unsigned long vpte = address & ~rmask;
 
-#ifdef CONFIG_PPC_64K_PAGES
- vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
-#else
 vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
-#endif
 vpte |= rid;
 __flush_tlb_page(tlb->mm, vpte, tsize, 0);
 }
@@ -625,21 +616,12 @@ static void early_init_this_mmu(void)
 
 case PPC_HTW_IBM:
 mas4 |= MAS4_INDD;
-#ifdef CONFIG_PPC_64K_PAGES
- mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = MMU_PAGE_256M;
-#else
 mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
 mmu_pte_psize = MMU_PAGE_1M;
-#endif
 break;
 
 case PPC_HTW_NONE:
-#ifdef CONFIG_PPC_64K_PAGES
- mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
-#else
 mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
-#endif
 mmu_pte_psize = mmu_virtual_psize;
 break;
 }
@@ -648,7 +630,6 @@ static void early_init_this_mmu(void)
 #ifdef CONFIG_PPC_FSL_BOOK3E
 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 unsigned int num_cams;
- int __maybe_unused cpu = smp_processor_id();
 bool map = true;
 
 /* use a quarter of the TLBCAM for bolted linear map */
@@ -722,6 +703,8 @@ static void __init early_init_mmu_global(void)
 * for use by the TLB miss code
 */
 linear_map_top = memblock_end_of_DRAM();
+
+ ioremap_bot = IOREMAP_BASE;
 }
 
 static void __init early_mmu_set_memory_limit(void)
@@ -800,5 +783,9 @@ void __init early_init_mmu(void)
 #ifdef CONFIG_PPC_47x
 early_init_mmu_47x();
 #endif
+
+#ifdef CONFIG_PPC_MM_SLICES
+ mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT);
+#endif
 }
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index e066a658acac..2ca407cedbe7 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
 * This file contains low-level functions for performing various
 * types of TLB invalidations on various processors with no hash
@@ -18,12 +19,6 @@
 *
 * Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
 * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
 */
 
 #include <asm/reg.h>
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
index 9ed90064f542..1f110c3c48fb 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -1,13 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
 * Low level TLB miss handlers for Book3E
 *
 * Copyright (C) 2008-2009
 * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/ #include <asm/processor.h> @@ -24,11 +20,7 @@ #include <asm/kvm_booke_hv_asm.h> #include <asm/feature-fixups.h> -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else #define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif #define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) #define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) #define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) @@ -167,13 +159,11 @@ MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) -#ifndef CONFIG_PPC_64K_PAGES rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 cmpdi cr0,r14,0 bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -682,18 +672,7 @@ normal_tlb_miss: * order to handle the weird page table format used by linux */ ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif sldi r15,r10,60 clrrdi r14,r14,3 or r10,r15,r14 @@ -732,11 +711,7 @@ finish_normal_tlb_miss: /* Check page size, if not standard, update MAS1 */ rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif beq- 1f mfspr r11,SPRN_MAS1 rlwimi r11,r14,31,21,24 @@ -857,14 +832,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 @@ -1106,14 +1079,12 @@ htw_tlb_miss: cmpdi cr0,r15,0 bge htw_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 @@ -1132,9 +1103,7 @@ htw_tlb_miss: * 4K page we need to extract a bit from the virtual address and * insert it into the "PA52" bit of the RPN. 
 */
-#ifndef CONFIG_PPC_64K_PAGES
 rlwimi r15,r16,32-9,20,20
-#endif
 /* Now we build the MAS:
 *
 * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
@@ -1144,11 +1113,7 @@ htw_tlb_miss:
 * MAS 2 : Use defaults
 * MAS 3+7 : Needs to be done
 */
-#ifdef CONFIG_PPC_64K_PAGES
- ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
-#else
 ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
-#endif
 BEGIN_MMU_FTR_SECTION
 srdi r16,r10,32
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f976676004ad..50d68d21ddcc 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
 */
 #define pr_fmt(fmt) "numa: " fmt
@@ -32,7 +28,6 @@
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
-#include <asm/cputhreads.h>
 #include <asm/topology.h>
 #include <asm/firmware.h>
 #include <asm/paca.h>
@@ -168,6 +163,22 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
+int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ int dist = 0;
+
+ int i, index;
+
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ index = be32_to_cpu(distance_ref_points[i]);
+ if (cpu1_assoc[index] == cpu2_assoc[index])
+ break;
+ dist++;
+ }
+
+ return dist;
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
@@ -217,7 +228,7 @@ static int associativity_to_nid(const __be32 *associativity)
 {
 int nid = NUMA_NO_NODE;
 
- if (min_common_depth == -1)
+ if (!numa_enabled)
 goto out;
 
 if (of_read_number(associativity, 1) >= min_common_depth)
@@ -421,17 +432,19 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
 static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 {
 struct assoc_arrays aa = { .arrays = NULL };
- int default_nid = 0;
+ int default_nid = NUMA_NO_NODE;
 int nid = default_nid;
 int rc, index;
 
+ if ((min_common_depth < 0) || !numa_enabled)
+ return default_nid;
+
 rc = of_get_assoc_arrays(&aa);
 if (rc)
 return default_nid;
 
- if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
- !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
- lmb->aa_index < aa.n_arrays) {
+ if (min_common_depth <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
 index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
 nid = of_read_number(&aa.arrays[index], 1);
@@ -631,8 +644,14 @@ static int __init parse_numa_properties(void)
 
 min_common_depth = find_min_common_depth();
 
- if (min_common_depth < 0)
+ if (min_common_depth < 0) {
+ /*
+ * If we fail to parse min_common_depth from the device tree,
+ * mark NUMA as disabled and boot with NUMA disabled.
+ */ + numa_enabled = false; return min_common_depth; + } dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); @@ -748,7 +767,7 @@ void __init dump_numa_cpu_topology(void) unsigned int node; unsigned int cpu, count; - if (min_common_depth == -1 || !numa_enabled) + if (!numa_enabled) return; for_each_online_node(node) { @@ -813,7 +832,7 @@ static void __init find_possible_nodes(void) struct device_node *rtas; u32 numnodes, i; - if (min_common_depth <= 0) + if (!numa_enabled) return; rtas = of_find_node_by_path("/rtas"); @@ -908,16 +927,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); -static bool topology_updates_enabled = true; +/* + * The platform can inform us through one of several mechanisms + * (post-migration device tree updates, PRRN or VPHN) that the NUMA + * assignment of a resource has changed. This controls whether we act + * on that. Disabled by default. + */ +static bool topology_updates_enabled; static int __init early_topology_updates(char *p) { if (!p) return 0; - if (!strcmp(p, "off")) { - pr_info("Disabling topology updates\n"); - topology_updates_enabled = false; + if (!strcmp(p, "on")) { + pr_warn("Caution: enabling topology updates\n"); + topology_updates_enabled = true; } return 0; @@ -1009,7 +1034,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) struct device_node *memory = NULL; int nid; - if (!numa_enabled || (min_common_depth < 0)) + if (!numa_enabled) return first_online_node; memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); @@ -1062,9 +1087,6 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR - -#include "vphn.h" - struct topology_update_data { struct topology_update_data *next; unsigned int cpu; @@ -1160,25 +1182,13 @@ static int update_cpu_associativity_changes_mask(void) * Retrieve the new associativity information for a virtual processor's * home node. 
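
The cpu_distance() helper added to numa.c walks the firmware's distance reference points from the most significant level down and counts how many levels differ before the first one both CPUs share. A simplified user-space model, using plain host-endian ints in place of the kernel's __be32 associativity arrays and invented values:

    #include <stdio.h>

    /* Hypothetical reference points into the associativity arrays. */
    static const int distance_ref_points[] = { 4, 2 };
    #define DEPTH (sizeof(distance_ref_points) / sizeof(distance_ref_points[0]))

    /* Mirrors cpu_distance(): stop at the first level both CPUs share. */
    static int cpu_distance(const unsigned int *cpu1_assoc,
                            const unsigned int *cpu2_assoc)
    {
            int dist = 0;

            for (size_t i = 0; i < DEPTH; i++) {
                    int index = distance_ref_points[i];

                    if (cpu1_assoc[index] == cpu2_assoc[index])
                            break;
                    dist++;
            }
            return dist;
    }

    int main(void)
    {
            /* made-up associativity values */
            unsigned int cpu0[] = { 5, 0, 1, 0, 7, 0 };
            unsigned int cpu1[] = { 5, 0, 1, 0, 9, 0 };
            unsigned int cpu2[] = { 5, 0, 2, 0, 9, 0 };

            printf("dist(cpu0, cpu1) = %d\n", cpu_distance(cpu0, cpu1)); /* 1 */
            printf("dist(cpu0, cpu2) = %d\n", cpu_distance(cpu0, cpu2)); /* 2 */
            return 0;
    }
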
*/ -static long hcall_vphn(unsigned long cpu, __be32 *associativity) -{ - long rc; - long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; - u64 flags = 1; - int hwcpu = get_hard_smp_processor_id(cpu); - - rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); - vphn_unpack_associativity(retbuf, associativity); - - return rc; -} - static long vphn_get_associativity(unsigned long cpu, __be32 *associativity) { long rc; - rc = hcall_vphn(cpu, associativity); + rc = hcall_vphn(get_hard_smp_processor_id(cpu), + VPHN_FLAG_VCPU, associativity); switch (rc) { case H_FUNCTION: @@ -1498,6 +1508,9 @@ int start_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; @@ -1531,6 +1544,9 @@ int stop_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (prrn_enabled) { prrn_enabled = 0; #ifdef CONFIG_SMP @@ -1588,11 +1604,13 @@ static ssize_t topology_write(struct file *file, const char __user *buf, kbuf[read_len] = '\0'; - if (!strncmp(kbuf, "on", 2)) + if (!strncmp(kbuf, "on", 2)) { + topology_updates_enabled = true; start_topology_update(); - else if (!strncmp(kbuf, "off", 3)) + } else if (!strncmp(kbuf, "off", 3)) { stop_topology_update(); - else + topology_updates_enabled = false; + } else return -EINVAL; return count; @@ -1607,9 +1625,7 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - /* Do not poll for changes if disabled at boot */ - if (topology_updates_enabled) - start_topology_update(); + start_topology_update(); if (vphn_enabled) topology_schedule_update(); diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index a7b05214760c..ee4bd6d38602 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } @@ -61,7 +61,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -113,7 +113,7 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { if (!kernel) - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d3d61d29b4f1..e3759b69f81b 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains common routines for dealing with free of page tables * Along with common page table handling code @@ -14,11 +15,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
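
The pgtable-frag rename above keeps the existing scheme intact: several PTE fragments share one backing page, tracked by an atomic count, and the page is destructed and freed only when the last fragment is put. The shape of that pattern as a user-space sketch, with a plain counter standing in for pt_frag_refcount:

    #include <stdio.h>
    #include <stdlib.h>

    #define PTE_FRAG_NR 4 /* hypothetical fragments per page */

    struct pt_page {
            int frag_refcount;
            void *mem;
    };

    /* Drop one fragment; free the backing page on the last put. */
    static void pte_frag_put(struct pt_page *page)
    {
            if (--page->frag_refcount == 0) {
                    free(page->mem);
                    printf("backing page freed\n");
            }
    }

    int main(void)
    {
            struct pt_page page = { PTE_FRAG_NR, malloc(4096) };

            for (int i = 0; i < PTE_FRAG_NR; i++)
                    pte_frag_put(&page);
            return 0;
    }
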
*/ #include <linux/kernel.h> @@ -30,6 +26,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> +#include <asm/hugetlb.h> static inline int is_exec_fault(void) { @@ -299,3 +296,128 @@ unsigned long vmalloc_to_phys(void *va) return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); } EXPORT_SYMBOL_GPL(vmalloc_to_phys); + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page, _PAGE_PTE set + * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * and we can follow the address down to the page and take a ref on it. + * This function needs to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED + */ +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) +{ + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (hpage_shift) + *hpage_shift = 0; + + if (is_thp) + *is_thp = false; + + pgdp = pgdir + pgd_index(ea); + pgd = READ_ONCE(*pgdp); + /* + * Always operate on the local stack value. This makes sure the + * value doesn't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The returned pte_t * is still not + * stable, so the caller must recheck it for the above conditions. + */ + if (pgd_none(pgd)) + return NULL; + + if (pgd_is_leaf(pgd)) { + ret_pte = (pte_t *)pgdp; + goto out; + } + + if (is_hugepd(__hugepd(pgd_val(pgd)))) { + hpdp = (hugepd_t *)&pgd; + goto out_huge; + } + + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an RCU free and here we have + * interrupts disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + + if (pud_is_leaf(pud)) { + ret_pte = (pte_t *)pudp; + goto out; + } + + if (is_hugepd(__hugepd(pud_val(pud)))) { + hpdp = (hugepd_t *)&pud; + goto out_huge; + } + + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + + /* + * A hugepage collapse is captured by this condition, see + * pmdp_collapse_flush. + */ + if (pmd_none(pmd)) + return NULL; + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * A hugepage split is captured by this condition, see + * pmdp_invalidate. + * + * Huge page modification can be caught here too. + */ + if (pmd_is_serializing(pmd)) + return NULL; +#endif + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + + if (pmd_is_leaf(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; + } + + if (is_hugepd(__hugepd(pmd_val(pmd)))) { + hpdp = (hugepd_t *)&pmd; + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); + +out_huge: + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(*hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (hpage_shift) + *hpage_shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(__find_linux_pte); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6e56a6240bfa..73b84166d06a 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file contains the routines setting up the linux page tables. 
* -- paulus @@ -11,12 +12,6 @@ * * Derived from "arch/i386/mm/init.c" * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/kernel.h> @@ -32,180 +27,36 @@ #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> -#include <asm/io.h> #include <asm/setup.h> #include <asm/sections.h> -#include "mmu_decl.h" - -unsigned long ioremap_bot; -EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ +#include <mm/mmu_decl.h> extern char etext[], _stext[], _sinittext[], _einittext[]; -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - if (!slab_is_available()) - return memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); - - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - -void __iomem * -ioremap(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap); - -void __iomem * -ioremap_wc(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem * -ioremap_wt(phys_addr_t addr, unsigned long size) +static void __init *early_alloc_pgtable(unsigned long size) { - pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_wt); + void *ptr = memblock_alloc(size, size); -void __iomem * -ioremap_coherent(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached(PAGE_KERNEL); + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, size); - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); + return ptr; } -EXPORT_SYMBOL(ioremap_coherent); -void __iomem * -ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +static pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) { - pte_t pte = __pte(flags); - - /* writeable implies dirty for kernel addresses */ - if (pte_write(pte)) - pte = pte_mkdirty(pte); - - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - pte = pte_mkprivileged(pte); + if (pmd_none(*pmdp)) { + pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE); - return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_prot); - -void __iomem * -__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) -{ - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); -} - -void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) -{ - unsigned long v, i; - phys_addr_t p; - int err; - - /* - * Choose an address to map it to. - * Once the vmalloc system is running, we use it. - * Before then, we use space going down from IOREMAP_TOP - * (ioremap_bot records where we're up to). 
- */ - p = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - p; - - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16*1024*1024) - p += _ISA_MEM_BASE; - -#ifndef CONFIG_CRASH_DUMP - /* - * Don't allow anybody to remap normal RAM that we're using. - * mem_init() sets high_memory so only do the check after that. - */ - if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && - page_is_ram(__phys_to_pfn(p))) { - printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n", - (unsigned long long)p, __builtin_return_address(0)); - return NULL; - } -#endif - - if (size == 0) - return NULL; - - /* - * Is it already mapped? Perhaps overlapped by a previous - * mapping. - */ - v = p_block_mapped(p); - if (v) - goto out; - - if (slab_is_available()) { - struct vm_struct *area; - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (area == 0) - return NULL; - area->phys_addr = p; - v = (unsigned long) area->addr; - } else { - v = (ioremap_bot -= size); + pmd_populate_kernel(&init_mm, pmdp, ptep); } - - /* - * Should check if it is a candidate for a BAT mapping - */ - - err = 0; - for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v + i, p + i, prot); - if (err) { - if (slab_is_available()) - vunmap((void *)v); - return NULL; - } - -out: - return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); + return pte_offset_kernel(pmdp, va); } -EXPORT_SYMBOL(__ioremap); - -void iounmap(volatile void __iomem *addr) -{ - /* - * If mapped by BATs then there is nothing to do. - * Calling vfree() generates a benign warning. - */ - if (v_block_mapped((unsigned long)addr)) - return; - if (addr > high_memory && (unsigned long) addr < ioremap_bot) - vunmap((void *) (PAGE_MASK & (unsigned long)addr)); -} -EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -214,7 +65,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) /* Use upper 10 bits of VA to index the first level map */ pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(pd, va); + if (likely(slab_is_available())) + pg = pte_alloc_kernel(pd, va); + else + pg = early_pte_alloc_kernel(pd, va); if (pg != 0) { err = 0; /* The PTE should never be already set nor present in the @@ -245,7 +99,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) map_kernel_page(v, p, ktext ? 
PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_BOOK3S_32 if (ktext) - hash_preload(&init_mm, v, false, 0x300); + hash_preload(&init_mm, v); #endif v += PAGE_SIZE; p += PAGE_SIZE; @@ -263,10 +117,7 @@ void __init mapin_ram(void) if (base >= top) continue; base = mmu_mapin_ram(base, top); - if (IS_ENABLED(CONFIG_BDI_SWITCH)) - __mapin_ram_chunk(reg->base, top); - else - __mapin_ram_chunk(base, top); + __mapin_ram_chunk(base, top); } } @@ -353,7 +204,7 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - if (v_block_mapped((unsigned long)_stext) + 1) + if (v_block_mapped((unsigned long)_stext + 1)) mmu_mark_initmem_nx(); else change_page_attr(page, numpages, PAGE_KERNEL); @@ -384,6 +235,9 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)__start_rodata); change_page_attr(page, numpages, PAGE_KERNEL_RO); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index fb1375c07e8c..e78832dce7bb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * This file contains ioremap and related functions for 64-bit machines. + * This file contains pgtable related functions for 64-bit machines. * * Derived from arch/ppc64/mm/init.c * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -13,12 +14,6 @@ * * Dave Engebretsen <engebret@us.ibm.com> * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * */ #include <linux/signal.h> @@ -39,7 +34,6 @@ #include <asm/pgalloc.h> #include <asm/page.h> #include <asm/prom.h> -#include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -52,7 +46,7 @@ #include <asm/firmware.h> #include <asm/dma.h> -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifdef CONFIG_PPC_BOOK3S_64 @@ -90,218 +84,39 @@ unsigned long __pgd_val_bits; EXPORT_SYMBOL(__pgd_val_bits); unsigned long __kernel_virt_start; EXPORT_SYMBOL(__kernel_virt_start); -unsigned long __kernel_virt_size; -EXPORT_SYMBOL(__kernel_virt_size); unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); +unsigned long __kernel_io_end; struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; EXPORT_SYMBOL(__pte_frag_nr); unsigned long __pte_frag_size_shift; EXPORT_SYMBOL(__pte_frag_size_shift); -unsigned long ioremap_bot; -#else /* !CONFIG_PPC_BOOK3S_64 */ -unsigned long ioremap_bot = IOREMAP_BASE; #endif -/** - * __ioremap_at - Low level function to establish the page tables - * for an IO mapping - */ -void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) -{ - unsigned long i; - - /* We don't support the 4K PFN hack with ioremap */ - if (pgprot_val(prot) & H_PAGE_4K_PFN) - return NULL; - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - for (i = 0; i < size; i += PAGE_SIZE) - if (map_kernel_page((unsigned long)ea + i, pa + i, prot)) - return NULL; - - return (void __iomem *)ea; -} - -/** - * __iounmap_from - Low level function to tear down the page tables - * for an IO mapping. This is used for mappings that - * are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. - */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} - -void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller) -{ - phys_addr_t paligned; - void __iomem *ret; - - /* - * Choose an address to map it to. - * Once the imalloc system is running, we use it. - * Before that, we map using addresses going - * up from ioremap_bot. 
imalloc will use - * the addresses from ioremap_bot through - * IMALLOC_END - * - */ - paligned = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - paligned; - - if ((size == 0) || (paligned == 0)) - return NULL; - - if (slab_is_available()) { - struct vm_struct *area; - - area = __get_vm_area_caller(size, VM_IOREMAP, - ioremap_bot, IOREMAP_END, - caller); - if (area == NULL) - return NULL; - - area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, prot); - if (!ret) - vunmap(area->addr); - } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); - if (ret) - ioremap_bot += size; - } - - if (ret) - ret += addr & ~PAGE_MASK; - return ret; -} - -void __iomem * __ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); -} - -void __iomem * ioremap(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - pte_t pte = __pte(flags); - void *caller = __builtin_return_address(0); - - /* writeable implies dirty for kernel addresses */ - if (pte_write(pte)) - pte = pte_mkdirty(pte); - - /* we don't want to let _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - /* - * Force kernel mapping. - */ - pte = pte_mkprivileged(pte); - - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller); - return __ioremap_caller(addr, size, pte_pgprot(pte), caller); -} - - -/* - * Unmap an IO region and remove it from imalloc'd list. - * Access to IO memory should be serialized by driver. 
- */ -void __iounmap(volatile void __iomem *token) -{ - void *addr; - - if (!slab_is_available()) - return; - - addr = (void *) ((unsigned long __force) - PCI_FIX_ADDR(token) & PAGE_MASK); - if ((unsigned long)addr < ioremap_bot) { - printk(KERN_WARNING "Attempt to iounmap early bolted mapping" - " at 0x%p\n", addr); - return; - } - vunmap(addr); -} - -void iounmap(volatile void __iomem *token) -{ - if (ppc_md.iounmap) - ppc_md.iounmap(token); - else - __iounmap(token); -} - -EXPORT_SYMBOL(ioremap); -EXPORT_SYMBOL(ioremap_wc); -EXPORT_SYMBOL(ioremap_prot); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(__ioremap_at); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(__iounmap); -EXPORT_SYMBOL(__iounmap_at); - #ifndef __PAGETABLE_PUD_FOLDED /* 4 level page table */ struct page *pgd_page(pgd_t pgd) { - if (pgd_huge(pgd)) + if (pgd_is_leaf(pgd)) { + VM_WARN_ON(!pgd_huge(pgd)); return pte_page(pgd_pte(pgd)); + } return virt_to_page(pgd_page_vaddr(pgd)); } #endif struct page *pud_page(pud_t pud) { - if (pud_huge(pud)) + if (pud_is_leaf(pud)) { + VM_WARN_ON(!pud_huge(pud)); return pte_page(pud_pte(pud)); + } return virt_to_page(pud_page_vaddr(pud)); } @@ -311,8 +126,10 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) + if (pmd_is_leaf(pmd)) { + VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd))); return pte_page(pmd_pte(pmd)); + } return virt_to_page(pmd_page_vaddr(pmd)); } @@ -328,6 +145,9 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index a0d23e96e841..4154feac1da3 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -149,7 +149,7 @@ static int bats_show_603(struct seq_file *m, void *v) static int bats_open(struct inode *inode, struct file *file) { - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return single_open(file, bats_show_601, NULL); return single_open(file, bats_show_603, NULL); diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b430e4e08af6..a07278027c6f 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2016, Rashmica Gupta, IBM Corp. * @@ -7,11 +8,6 @@ * * If radix is enabled then there is no hash page table and so no debugfs file * is generated. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
*/ #include <linux/debugfs.h> #include <linux/fs.h> @@ -241,7 +237,6 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 return -1; } -#ifdef CONFIG_PPC_PSERIES static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { struct hash_pte ptes[4]; @@ -278,7 +273,6 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 * } return -1; } -#endif static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, unsigned long *lp_bits) @@ -320,10 +314,9 @@ static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { -#ifdef CONFIG_PPC_PSERIES - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR)) return pseries_find(ea, psize, primary, v, r); -#endif + return native_find(ea, psize, primary, v, r); } @@ -390,12 +383,13 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) psize = mmu_vmalloc_psize; else psize = mmu_io_psize; -#ifdef CONFIG_PPC_64K_PAGES + /* check for secret 4K mappings */ - if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) || - ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && + ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO || + (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) psize = mmu_io_psize; -#endif + /* check for hashpte */ status = hpte_find(st, addr, psize); @@ -473,9 +467,10 @@ static void walk_linearmapping(struct pg_state *st) static void walk_vmemmap(struct pg_state *st) { -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmemmap_backing *ptr = vmemmap_list; + if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; /* * Traverse the vmemmaped memory and dump pages that are in the hash * pagetable. @@ -485,7 +480,6 @@ static void walk_vmemmap(struct pg_state *st) ptr = ptr->list; } seq_puts(st->seq, "---[ vmemmap end ]---\n"); -#endif } static void populate_markers(void) @@ -499,11 +493,7 @@ static void populate_markers(void) address_markers[6].start_address = PHB_IO_END; address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_BOOK3S_64 - address_markers[9].start_address = H_VMEMMAP_BASE; -#else - address_markers[9].start_address = VMEMMAP_BASE; -#endif + address_markers[9].start_address = H_VMEMMAP_START; } static int ptdump_show(struct seq_file *m, void *v) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 37138428ab55..2f9ddc29c535 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2016, Rashmica Gupta, IBM Corp. * @@ -8,11 +9,6 @@ * Derived from the arm64 implementation: * Copyright (c) 2014, The Linux Foundation, Laura Abbott. * (C) Copyright 2008 Intel Corporation, Arjan van de Ven. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
*/ #include <linux/debugfs.h> #include <linux/fs.h> @@ -30,10 +26,6 @@ #include "ptdump.h" -#ifdef CONFIG_PPC32 -#define KERN_VIRT_START 0 -#endif - /* * To visualise what is happening, * @@ -68,6 +60,8 @@ struct pg_state { unsigned long last_pa; unsigned int level; u64 current_flags; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -90,10 +84,6 @@ static struct addr_marker address_markers[] = { #else { 0, "Early I/O remap start" }, { 0, "Early I/O remap end" }, -#ifdef CONFIG_NOT_COHERENT_CACHE - { 0, "Consistent mem start" }, - { 0, "Consistent mem end" }, -#endif #ifdef CONFIG_HIGHMEM { 0, "Highmem PTEs start" }, { 0, "Highmem PTEs end" }, @@ -101,9 +91,25 @@ static struct addr_marker address_markers[] = { { 0, "Fixmap start" }, { 0, "Fixmap end" }, #endif +#ifdef CONFIG_KASAN + { 0, "kasan shadow mem start" }, + { 0, "kasan shadow mem end" }, +#endif { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_putc(m, c) \ +({ \ + if (m) \ + seq_putc(m, c); \ +}) + static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { @@ -121,19 +127,19 @@ static void dump_flag_info(struct pg_state *st, const struct flag_info val = pte & flag->val; if (flag->shift) val = val >> flag->shift; - seq_printf(st->seq, " %s:%llx", flag->set, val); + pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val); } else { if ((pte & flag->mask) == flag->val) s = flag->set; else s = flag->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } st->current_flags &= ~flag->mask; } if (st->current_flags != 0) - seq_printf(st->seq, " unknown flags:%llx", st->current_flags); + pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags); } static void dump_addr(struct pg_state *st, unsigned long addr) @@ -148,12 +154,12 @@ static void dump_addr(struct pg_state *st, unsigned long addr) #define REG "0x%08lx" #endif - seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { - seq_printf(st->seq, "[" REG "]", st->start_pa); + pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa); delta = PAGE_SIZE >> 10; } else { - seq_printf(st->seq, " " REG " ", st->start_pa); + pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); delta = (addr - st->start_address) >> 10; } /* Work out what appropriate unit to use */ @@ -161,8 +167,22 @@ static void dump_addr(struct pg_state *st, unsigned long addr) delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c", delta, *unit); + pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit); + +} +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx) + return; + + if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) + return; + + WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } static void note_page(struct pg_state *st, unsigned long addr, @@ -178,7 +198,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->start_address = addr; st->start_pa = pa; st->last_pa = pa; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump 
the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -194,6 +214,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* Check the PTE flags */ if (st->current_flags) { + note_prot_wx(st, addr); dump_addr(st, addr); /* Dump all the flags */ @@ -202,7 +223,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_putc(st->seq, '\n'); + pt_dump_seq_putc(st->seq, '\n'); } /* @@ -211,7 +232,7 @@ static void note_page(struct pg_state *st, unsigned long addr, */ while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; st->start_pa = pa; @@ -244,7 +265,7 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_huge(*pmd)) + if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd)) /* pmd exists */ walk_pte(st, pmd, addr); else @@ -260,7 +281,7 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) for (i = 0; i < PTRS_PER_PUD; i++, pud++) { addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_huge(*pud)) + if (!pud_none(*pud) && !pud_is_leaf(*pud)) /* pud exists */ walk_pmd(st, pud, addr); else @@ -270,18 +291,16 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) static void walk_pagetables(struct pg_state *st) { - pgd_t *pgd = pgd_offset_k(0UL); unsigned int i; - unsigned long addr; - - addr = st->start_address; + unsigned long addr = st->start_address & PGDIR_MASK; + pgd_t *pgd = pgd_offset_k(addr); /* * Traverse the linux pagetable structure and dump pages that are in * the hash pagetable. */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - if (!pgd_none(*pgd) && !pgd_huge(*pgd)) + for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { + if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); else @@ -303,25 +322,25 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; + /* What is the ifdef about? 
*/ #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[i++].start_address = H_VMEMMAP_BASE; + address_markers[i++].start_address = H_VMEMMAP_START; #else address_markers[i++].start_address = VMEMMAP_BASE; #endif #else /* !CONFIG_PPC64 */ address_markers[i++].start_address = ioremap_bot; address_markers[i++].start_address = IOREMAP_TOP; -#ifdef CONFIG_NOT_COHERENT_CACHE - address_markers[i++].start_address = IOREMAP_TOP; - address_markers[i++].start_address = IOREMAP_TOP + - CONFIG_CONSISTENT_SIZE; -#endif #ifdef CONFIG_HIGHMEM address_markers[i++].start_address = PKMAP_BASE; address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); #endif address_markers[i++].start_address = FIXADDR_START; address_markers[i++].start_address = FIXADDR_TOP; +#ifdef CONFIG_KASAN + address_markers[i++].start_address = KASAN_SHADOW_START; + address_markers[i++].start_address = KASAN_SHADOW_END; +#endif #endif /* CONFIG_PPC64 */ } @@ -330,12 +349,13 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, + .start_address = PAGE_OFFSET, }; - if (radix_enabled()) - st.start_address = PAGE_OFFSET; - else +#ifdef CONFIG_PPC64 + if (!radix_enabled()) st.start_address = KERN_VIRT_START; +#endif /* Traverse kernel page tables */ walk_pagetables(&st); @@ -366,6 +386,31 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = address_markers, + .check_wx = true, + .start_address = PAGE_OFFSET, + }; + +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + st.start_address = KERN_VIRT_START; +#endif + + walk_pagetables(&st); + + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", + st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} +#endif + static int ptdump_init(void) { struct dentry *debugfs_file; diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index aec91dbcdc0b..42bbcd47cc85 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * address space "slices" (meta-segments) support * @@ -6,20 +7,6 @@ * Based on hugetlb implementation * * Copyright (C) 2003 David Gibson, IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #undef DEBUG @@ -101,7 +88,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->context.slb_addr_limit - len) < addr) + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); } @@ -118,13 +105,11 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) unsigned long start = slice << SLICE_HIGH_SHIFT; unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); -#ifdef CONFIG_PPC64 /* Hack, so that each address is controlled by exactly one * of the high or low area bitmaps; the first high area starts * at 4GB, not 0 */ if (start == 0) - start = SLICE_LOW_TOP; -#endif + start = (unsigned long)SLICE_LOW_TOP; return !slice_area_is_free(mm, start, end - start); } @@ -150,40 +135,6 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, __set_bit(i, ret->high_slices); } -#ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) -{ -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - return &mm->context.mask_64k; -#endif - if (psize == MMU_PAGE_4K) - return &mm->context.mask_4k; -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_16M) - return &mm->context.mask_16m; - if (psize == MMU_PAGE_16G) - return &mm->context.mask_16g; -#endif - BUG(); -} -#elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) -{ - if (psize == mmu_virtual_psize) - return &mm->context.mask_base_psize; -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_512K) - return &mm->context.mask_512k; - if (psize == MMU_PAGE_8M) - return &mm->context.mask_8m; -#endif - BUG(); -} -#else -#error "Must define the slice masks for page sizes supported by the platform" -#endif - static bool slice_check_range_fits(struct mm_struct *mm, const struct slice_mask *available, unsigned long start, unsigned long len) @@ -246,14 +197,14 @@ static void slice_convert(struct mm_struct *mm, slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); slice_print_mask(" mask", mask); - psize_mask = slice_mask_for_size(mm, psize); + psize_mask = slice_mask_for_size(&mm->context, psize); /* We need to use a spinlock here to protect against * concurrent 64k -> 4k demotion ... 
*/ spin_lock_irqsave(&slice_convert_lock, flags); - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); for (i = 0; i < SLICE_NUM_LOW; i++) { if (!(mask->low_slices & (1u << i))) continue; @@ -263,7 +214,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); old_mask->low_slices &= ~(1u << i); psize_mask->low_slices |= 1u << i; @@ -272,8 +223,8 @@ static void slice_convert(struct mm_struct *mm, (((unsigned long)psize) << (mask_index * 4)); } - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { if (!test_bit(i, mask->high_slices)) continue; @@ -282,7 +233,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); __clear_bit(i, old_mask->high_slices); __set_bit(i, psize_mask->high_slices); @@ -292,8 +243,8 @@ static void slice_convert(struct mm_struct *mm, } slice_dbg(" lsps=%lx, hsps=%lx\n", - (unsigned long)mm->context.low_slices_psize, - (unsigned long)mm->context.high_slices_psize); + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -393,7 +344,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * DEFAULT_MAP_WINDOW we should apply this. */ if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; while (addr > min_addr) { info.high_limit = addr; @@ -505,20 +456,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, return -ENOMEM; } - if (high_limit > mm->context.slb_addr_limit) { + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { /* * Increasing the slb_addr_limit does not require * slice mask cache to be recalculated because it should * be already initialised beyond the old address limit. */ - mm->context.slb_addr_limit = high_limit; + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); on_each_cpu(slice_flush_segments, mm, 1); } /* Sanity checks */ BUG_ON(mm->task_size == 0); - BUG_ON(mm->context.slb_addr_limit == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); @@ -538,7 +489,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); /* * Here "good" means slices that are already the right page size, @@ -565,7 +516,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * a pointer to good mask for the next code to use. 
*/ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); if (fixed) slice_or_mask(&good_mask, maskp, compat_maskp); else @@ -642,14 +593,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); -#ifdef CONFIG_PPC_64K_PAGES - if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ slice_or_mask(&potential_mask, &potential_mask, compat_maskp); newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); } -#endif if (newaddr == -ENOMEM) return -ENOMEM; @@ -696,7 +646,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags) { return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, 0); + mm_ctx_user_psize(&current->mm->context), 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -706,7 +656,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags) { return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, 1); + mm_ctx_user_psize(&current->mm->context), 1); } unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -717,10 +667,10 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) VM_BUG_ON(radix_enabled()); if (slice_addr_is_low(addr)) { - psizes = mm->context.low_slices_psize; + psizes = mm_ctx_low_slices(&mm->context); index = GET_LOW_SLICE_INDEX(addr); } else { - psizes = mm->context.high_slices_psize; + psizes = mm_ctx_high_slices(&mm->context); index = GET_HIGH_SLICE_INDEX(addr); } mask_index = index & 0x1; @@ -741,27 +691,22 @@ void slice_init_new_context_exec(struct mm_struct *mm) * case of fork it is just inherited from the mm being * duplicated. */ -#ifdef CONFIG_PPC64 - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; -#else - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif - - mm->context.user_psize = psize; + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); + mm_ctx_set_user_psize(&mm->context, psize); /* * Set all slice psizes to the default. */ - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); - hpsizes = mm->context.high_slices_psize; + hpsizes = mm_ctx_high_slices(&mm->context); memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); /* * Slice mask cache starts zeroed, fill the default size cache. 
*/ - mask = slice_mask_for_size(mm, psize); + mask = slice_mask_for_size(&mm->context, psize); mask->low_slices = ~0UL; if (SLICE_NUM_HIGH) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); @@ -777,7 +722,7 @@ void slice_setup_new_exec(void) if (!is_32bit_task()) return; - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); } #endif @@ -816,22 +761,21 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { const struct slice_mask *maskp; - unsigned int psize = mm->context.user_psize; + unsigned int psize = mm_ctx_user_psize(&mm->context); VM_BUG_ON(radix_enabled()); - maskp = slice_mask_for_size(mm, psize); -#ifdef CONFIG_PPC_64K_PAGES + maskp = slice_mask_for_size(&mm->context, psize); + /* We need to account for 4k slices too */ - if (psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } -#endif return !slice_check_range_fits(mm, maskp, addr, len); } diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c deleted file mode 100644 index f83044faac23..000000000000 --- a/arch/powerpc/mm/vphn.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <asm/byteorder.h> -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. 
- */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h deleted file mode 100644 index f9ffdb3942fc..000000000000 --- a/arch/powerpc/mm/vphn.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. - */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. - */ -#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) - -extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); - -#endif
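
The __find_linux_pte() helper consolidated into pgtable.c above is only safe while interrupts are disabled, and the pte_t pointer it returns can still go stale underneath the caller. A minimal caller sketch under those assumptions; the helper name read_pte_lockless() and its handling of the out-parameters are illustrative, not from the patch:

#include <linux/mm.h>
#include <linux/irqflags.h>
#include <asm/pgtable.h>

static pte_t read_pte_lockless(struct mm_struct *mm, unsigned long ea)
{
	unsigned long flags;
	unsigned int hpage_shift;
	bool is_thp;
	pte_t *ptep, pte = __pte(0);

	local_irq_save(flags);	/* holds off the RCU free of the page tables */
	ptep = __find_linux_pte(mm->pgd, ea, &is_thp, &hpage_shift);
	if (ptep)
		pte = READ_ONCE(*ptep);	/* snapshot; *ptep itself stays unstable */
	local_irq_restore(flags);

	return pte;
}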
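The note_prot_wx() check added to ptdump.c reports a range as W+X when its accumulated flags contain every bit of pgprot_val(PAGE_KERNEL_X), which on powerpc carries both the write and the execute permission. A standalone sketch of that bitmask test, with made-up flag values standing in for the real pgprot bits:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_PAGE_RW		0x002u	/* illustrative values only */
#define FAKE_PAGE_EXEC		0x200u
#define FAKE_PAGE_KERNEL_X	(FAKE_PAGE_RW | FAKE_PAGE_EXEC)

/* Same shape as the note_prot_wx() test: only a mapping holding all
 * of the writable+executable bits is flagged. */
static bool is_wx(uint64_t flags)
{
	return (flags & FAKE_PAGE_KERNEL_X) == FAKE_PAGE_KERNEL_X;
}

int main(void)
{
	printf("RW+X: %d\n", is_wx(FAKE_PAGE_RW | FAKE_PAGE_EXEC));	/* 1 */
	printf("RO+X: %d\n", is_wx(FAKE_PAGE_EXEC));			/* 0 */
	return 0;
}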
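The slice code above stores one 4-bit page-size index per slice, two slices per byte: slice i lives in byte i >> 1, in the nibble selected by i & 1 (the index/mask_index arithmetic visible in slice_convert() and get_slice_psize()). A self-contained sketch of that packing, with hypothetical helper names:

#include <stdint.h>
#include <stdio.h>

static unsigned int get_psize(const uint8_t *psizes, unsigned int slice)
{
	unsigned int index = slice >> 1;	/* byte holding this slice   */
	unsigned int mask_index = slice & 1;	/* low nibble or high nibble */

	return (psizes[index] >> (mask_index * 4)) & 0xf;
}

static void set_psize(uint8_t *psizes, unsigned int slice, unsigned int psize)
{
	unsigned int index = slice >> 1;
	unsigned int mask_index = slice & 1;

	psizes[index] = (psizes[index] & ~(0xf << (mask_index * 4))) |
			(psize << (mask_index * 4));
}

int main(void)
{
	uint8_t psizes[8] = { 0 };

	set_psize(psizes, 3, 5);
	printf("slice 3 psize index: %u\n", get_psize(psizes, 3));	/* 5 */
	return 0;
}

The memset() calls in slice_init_new_context_exec() rely on the same layout: writing (psize << 4) | psize fills both nibbles of every byte with the default index in one pass.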
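The deleted vphn.c above documents how H_HOME_NODE_ASSOCIATIVITY returns associativity domain numbers as a terminated stream of mixed 16-bit and 32-bit fields. A host-endian userspace sketch of the same decoding rules; the byte swapping and the leading length cell the kernel prepends for the ibm,associativity property are deliberately left out, and all names are mine:

#include <stdint.h>
#include <stdio.h>

#define VPHN_FIELD_UNUSED	0xffffu
#define VPHN_FIELD_MSB		0x8000u
#define VPHN_FIELD_MASK		0x7fffu

/* A field with the MSB set carries a 15-bit domain number by itself;
 * a field with the MSB clear supplies the upper bits of a 32-bit
 * domain number completed by the following 16-bit field; 0xffff
 * terminates the stream. Returns the number of domains decoded. */
static int unpack(const uint16_t *field, int nfields, uint32_t *out)
{
	int i, n = 0;

	for (i = 0; i < nfields; i++) {
		uint16_t cur = field[i];

		if (cur == VPHN_FIELD_UNUSED)
			break;			/* list terminator */
		if (cur & VPHN_FIELD_MSB)
			out[n++] = cur & VPHN_FIELD_MASK;
		else if (i + 1 < nfields)
			out[n++] = (uint32_t)cur << 16 | field[++i];
	}
	return n;
}

int main(void)
{
	/* one 16-bit domain (2), one 32-bit domain (0x12345), terminator */
	const uint16_t stream[] = { 0x8002, 0x0001, 0x2345, 0xffff };
	uint32_t doms[4];
	int i, n = unpack(stream, 4, doms);

	for (i = 0; i < n; i++)
		printf("domain %d: 0x%x\n", i, doms[i]);
	return 0;
}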

