Merge tag 'v4.15-rc1' into next-seccomp

Linux 4.15-rc1
author: James Morris <james.l.morris@oracle.com> 2017-11-29 12:47:41 +1100
committer: James Morris <james.l.morris@oracle.com> 2017-11-29 12:47:41 +1100
commit: cf40a76e7d5874bb25f4404eecc58a2e033af885 (patch)
tree: 8fd81cbea03c87b3d41d7ae5b1d11eadd35d6ef5 /arch/powerpc/mm
parent: ab5348c9c23cd253f5902980d2d8fe067dc24c82 (diff)
parent: 4fbd8d194f06c8a3fd2af1ce560ddb31f7ec8323 (diff)
download: talos-op-linux-cf40a76e7d5874bb25f4404eecc58a2e033af885.tar.gz
talos-op-linux-cf40a76e7d5874bb25f4404eecc58a2e033af885.zip
36 files changed, 1177 insertions, 1240 deletions
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index f4c6472f2fc4..f29212e40f40 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -22,8 +22,11 @@
 
 extern int __map_without_ltlbs;
 
+static unsigned long block_mapped_ram;
+
 /*
- * Return PA for this VA if it is in IMMR area, or 0
+ * Return PA for this VA if it is in an area mapped with LTLBs.
+ * Otherwise, returns 0
  */
 phys_addr_t v_block_mapped(unsigned long va)
 {
@@ -33,11 +36,13 @@ phys_addr_t v_block_mapped(unsigned long va)
 		return 0;
 	if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
 		return p + va - VIRT_IMMR_BASE;
+	if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
+		return __pa(va);
 	return 0;
 }
 
 /*
- * Return VA for a given PA or 0 if not mapped
+ * Return VA for a given PA mapped with LTLBs or 0 if not mapped
  */
 unsigned long p_block_mapped(phys_addr_t pa)
 {
@@ -47,6 +52,8 @@ unsigned long p_block_mapped(phys_addr_t pa)
 		return 0;
 	if (pa >= p && pa < p + IMMR_SIZE)
 		return VIRT_IMMR_BASE + pa - p;
+	if (pa < block_mapped_ram)
+		return (unsigned long)__va(pa);
 	return 0;
 }
 
@@ -58,7 +65,7 @@ unsigned long p_block_mapped(phys_addr_t pa)
 void __init MMU_init_hw(void)
 {
 	/* PIN up to the 3 first 8Mb after IMMR in DTLB table */
-#ifdef CONFIG_PIN_TLB
+#ifdef CONFIG_PIN_TLB_DATA
 	unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
 	unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY;
 #ifdef CONFIG_PIN_TLB_IMMR
@@ -80,7 +87,7 @@ void __init MMU_init_hw(void)
 #endif
 }
 
-static void mmu_mapin_immr(void)
+static void __init mmu_mapin_immr(void)
 {
 	unsigned long p = PHYS_IMMR_BASE;
 	unsigned long v = VIRT_IMMR_BASE;
@@ -96,8 +103,11 @@ static void mmu_mapin_immr(void)
 extern unsigned int DTLBMiss_jmp;
 #endif
 extern unsigned int DTLBMiss_cmp, FixupDAR_cmp;
+#ifndef CONFIG_PIN_TLB_TEXT
+extern unsigned int ITLBMiss_cmp;
+#endif
 
-void mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped)
+static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped)
 {
 	unsigned int instr = *addr;
 
@@ -116,6 +126,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
 #ifndef CONFIG_PIN_TLB_IMMR
 		patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP);
 #endif
+#ifndef CONFIG_PIN_TLB_TEXT
+		mmu_patch_cmp_limit(&ITLBMiss_cmp, 0);
+#endif
 	} else {
 		mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
 	}
@@ -133,11 +146,13 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
 	if (mapped)
 		memblock_set_current_limit(mapped);
 
+	block_mapped_ram = mapped;
+
 	return mapped;
 }
 
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
-				phys_addr_t first_memblock_size)
+void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				       phys_addr_t first_memblock_size)
 {
 	/* We don't currently support the first MEMBLOCK not mapping 0
 	 * physical on those processors
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 7414034df1c3..76a6b057d454 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for the linux ppc-specific parts of the memory manager.
 #
@@ -8,22 +9,20 @@ ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
 obj-y				:= fault.o mem.o pgtable.o mmap.o \
 				   init_$(BITS).o pgtable_$(BITS).o \
-				   init-common.o
+				   init-common.o mmu_context.o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(BITS)e.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)   += pgtable-book3e.o
-obj-$(CONFIG_PPC_STD_MMU_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
 obj-$(CONFIG_PPC_RADIX_MMU)	+= pgtable-radix.o tlb-radix.o
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(BITS).o
-ifeq ($(CONFIG_PPC_STD_MMU_64),y)
+ifeq ($(CONFIG_PPC_BOOK3S_64),y)
 obj-$(CONFIG_PPC_4K_PAGES)	+= hash64_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)	+= hash64_64k.o
 endif
-obj-$(CONFIG_PPC_ICSWX)		+= icswx.o
-obj-$(CONFIG_PPC_ICSWX_PID)	+= icswx_pid.o
 obj-$(CONFIG_40x)		+= 40x_mmu.o
 obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_PPC_8xx)		+= 8xx_mmu.o
@@ -33,7 +32,7 @@ obj-$(CONFIG_PPC_SPLPAR)	+= vphn.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-y				+= hugetlbpage.o
 ifeq ($(CONFIG_HUGETLB_PAGE),y)
-obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_RADIX_MMU)	+= hugetlbpage-radix.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index b1c144b03fcf..14cfb11b09d0 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -205,7 +205,7 @@ static void dump_hpte_info(struct pg_state *st, unsigned long ea, u64 v, u64 r,
 	aps_index = calculate_pagesize(st, aps, "actual");
 	if (aps_index != 2)
 		seq_printf(st->seq, "LP enc: %lx", lp);
-	seq_puts(st->seq, "\n");
+	seq_putc(st->seq, '\n');
 }
 
 
@@ -500,7 +500,7 @@ static void populate_markers(void)
 	address_markers[6].start_address = PHB_IO_END;
 	address_markers[7].start_address = IOREMAP_BASE;
 	address_markers[8].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	address_markers[9].start_address =  H_VMEMMAP_BASE;
 #else
 	address_markers[9].start_address =  VMEMMAP_BASE;
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 44fe4833910f..c2e7dea59490 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -112,7 +112,7 @@ struct flag_info {
 
 static const struct flag_info flag_array[] = {
 	{
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_PRIVILEGED,
 		.val	= 0,
 #else
@@ -147,7 +147,7 @@ static const struct flag_info flag_array[] = {
 		.set	= "present",
 		.clear	= "       ",
 	}, {
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= H_PAGE_HASHPTE,
 		.val	= H_PAGE_HASHPTE,
 #else
@@ -157,7 +157,7 @@ static const struct flag_info flag_array[] = {
 		.set	= "hpte",
 		.clear	= "    ",
 	}, {
-#ifndef CONFIG_PPC_STD_MMU_64
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_GUARDED,
 		.val	= _PAGE_GUARDED,
 		.set	= "guarded",
@@ -174,7 +174,7 @@ static const struct flag_info flag_array[] = {
 		.set	= "accessed",
 		.clear	= "        ",
 	}, {
-#ifndef CONFIG_PPC_STD_MMU_64
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_WRITETHRU,
 		.val	= _PAGE_WRITETHRU,
 		.set	= "write through",
@@ -350,7 +350,7 @@ static void note_page(struct pg_state *st, unsigned long addr,
 					  st->current_flags,
 					  pg_level[st->level].num);
 
-			seq_puts(st->seq, "\n");
+			seq_putc(st->seq, '\n');
 		}
 
 		/*
@@ -450,7 +450,7 @@ static void populate_markers(void)
 	address_markers[i++].start_address = PHB_IO_END;
 	address_markers[i++].start_address = IOREMAP_BASE;
 	address_markers[i++].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	address_markers[i++].start_address =  H_VMEMMAP_BASE;
 #else
 	address_markers[i++].start_address =  VMEMMAP_BASE;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4c422632047b..4797d08581ce 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -45,43 +45,39 @@
 #include <asm/siginfo.h>
 #include <asm/debug.h>
 
-#include "icswx.h"
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline bool notify_page_fault(struct pt_regs *regs)
 {
-	int ret = 0;
+	bool ret = false;
 
+#ifdef CONFIG_KPROBES
 	/* kprobe_running() needs smp_processor_id() */
 	if (!user_mode(regs)) {
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 11))
-			ret = 1;
+			ret = true;
 		preempt_enable();
 	}
+#endif /* CONFIG_KPROBES */
+
+	if (unlikely(debugger_fault_handler(regs)))
+		ret = true;
 
 	return ret;
 }
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	return 0;
-}
-#endif
 
 /*
  * Check whether the instruction at regs->nip is a store using
  * an update addressing form which will update r1.
  */
-static int store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(struct pt_regs *regs)
 {
 	unsigned int inst;
 
 	if (get_user(inst, (unsigned int __user *)regs->nip))
-		return 0;
+		return false;
 	/* check for 1 in the rA field */
 	if (((inst >> 16) & 0x1f) != 1)
-		return 0;
+		return false;
 	/* check major opcode */
 	switch (inst >> 26) {
 	case 37:	/* stwu */
@@ -89,7 +85,7 @@ static int store_updates_sp(struct pt_regs *regs)
 	case 45:	/* sthu */
 	case 53:	/* stfsu */
 	case 55:	/* stfdu */
-		return 1;
+		return true;
 	case 62:	/* std or stdu */
 		return (inst & 3) == 1;
 	case 31:
@@ -101,18 +97,53 @@ static int store_updates_sp(struct pt_regs *regs)
 		case 439:	/* sthux */
 		case 695:	/* stfsux */
 		case 759:	/* stfdux */
-			return 1;
+			return true;
 		}
 	}
-	return 0;
+	return false;
 }
 /*
  * do_page_fault error handling helpers
  */
 
-#define MM_FAULT_RETURN		0
-#define MM_FAULT_CONTINUE	-1
-#define MM_FAULT_ERR(sig)	(sig)
+static int
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
+{
+	/*
+	 * If we are in kernel mode, bail out with a SEGV, this will
+	 * be caught by the assembly which will restore the non-volatile
+	 * registers before calling bad_page_fault()
+	 */
+	if (!user_mode(regs))
+		return SIGSEGV;
+
+	_exception(SIGSEGV, regs, si_code, address);
+
+	return 0;
+}
+
+static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
+{
+	return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
+}
+
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
+{
+	struct mm_struct *mm = current->mm;
+
+	/*
+	 * Something tried to access memory that isn't in our memory map..
+	 * Fix it, but check if it's kernel or user first..
+	 */
+	up_read(&mm->mmap_sem);
+
+	return __bad_area_nosemaphore(regs, address, si_code);
+}
+
+static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+{
+	return __bad_area(regs, address, SEGV_MAPERR);
+}
 
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
 		     unsigned int fault)
@@ -121,7 +152,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 	unsigned int lsb = 0;
 
 	if (!user_mode(regs))
-		return MM_FAULT_ERR(SIGBUS);
+		return SIGBUS;
 
 	current->thread.trap_nr = BUS_ADRERR;
 	info.si_signo = SIGBUS;
@@ -142,25 +173,17 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 #endif
 	info.si_addr_lsb = lsb;
 	force_sig_info(SIGBUS, &info, current);
-	return MM_FAULT_RETURN;
+	return 0;
 }
 
 static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 {
 	/*
-	 * Pagefault was interrupted by SIGKILL. We have no reason to
-	 * continue the pagefault.
+	 * Kernel page fault interrupted by SIGKILL. We have no reason to
+	 * continue processing.
 	 */
-	if (fatal_signal_pending(current)) {
-		/* Coming from kernel, we need to deal with uaccess fixups */
-		if (user_mode(regs))
-			return MM_FAULT_RETURN;
-		return MM_FAULT_ERR(SIGKILL);
-	}
-
-	/* No fault: be happy */
-	if (!(fault & VM_FAULT_ERROR))
-		return MM_FAULT_CONTINUE;
+	if (fatal_signal_pending(current) && !user_mode(regs))
+		return SIGKILL;
 
 	/* Out of memory */
 	if (fault & VM_FAULT_OOM) {
@@ -169,19 +192,176 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 		 * made us unable to handle the page fault gracefully.
 		 */
 		if (!user_mode(regs))
-			return MM_FAULT_ERR(SIGKILL);
+			return SIGSEGV;
 		pagefault_out_of_memory();
-		return MM_FAULT_RETURN;
+	} else {
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
+			return do_sigbus(regs, addr, fault);
+		else if (fault & VM_FAULT_SIGSEGV)
+			return bad_area_nosemaphore(regs, addr);
+		else
+			BUG();
+	}
+	return 0;
+}
+
+/* Is this a bad kernel fault ? */
+static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
+			     unsigned long address)
+{
+	if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) {
+		printk_ratelimited(KERN_CRIT "kernel tried to execute"
+				   " exec-protected page (%lx) -"
+				   "exploit attempt? (uid: %d)\n",
+				   address, from_kuid(&init_user_ns,
+						      current_uid()));
 	}
+	return is_exec || (address >= TASK_SIZE);
+}
 
-	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
-		return do_sigbus(regs, addr, fault);
+static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
+				struct vm_area_struct *vma,
+				bool store_update_sp)
+{
+	/*
+	 * N.B. The POWER/Open ABI allows programs to access up to
+	 * 288 bytes below the stack pointer.
+	 * The kernel signal delivery code writes up to about 1.5kB
+	 * below the stack pointer (r1) before decrementing it.
+	 * The exec code can write slightly over 640kB to the stack
+	 * before setting the user r1.  Thus we allow the stack to
+	 * expand to 1MB without further checks.
+	 */
+	if (address + 0x100000 < vma->vm_end) {
+		/* get user regs even if this fault is in kernel mode */
+		struct pt_regs *uregs = current->thread.regs;
+		if (uregs == NULL)
+			return true;
 
-	/* We don't understand the fault code, this is fatal */
-	BUG();
-	return MM_FAULT_CONTINUE;
+		/*
+		 * A user-mode access to an address a long way below
+		 * the stack pointer is only valid if the instruction
+		 * is one which would update the stack pointer to the
+		 * address accessed if the instruction completed,
+		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+		 * (or the byte, halfword, float or double forms).
+		 *
+		 * If we don't check this then any write to the area
+		 * between the last mapped region and the stack will
+		 * expand the stack rather than segfaulting.
+		 */
+		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
+			return true;
+	}
+	return false;
+}
+
+static bool access_error(bool is_write, bool is_exec,
+			 struct vm_area_struct *vma)
+{
+	/*
+	 * Allow execution from readable areas if the MMU does not
+	 * provide separate controls over reading and executing.
+	 *
+	 * Note: That code used to not be enabled for 4xx/BookE.
+	 * It is now as I/D cache coherency for these is done at
+	 * set_pte_at() time and I see no reason why the test
+	 * below wouldn't be valid on those processors. This -may-
+	 * break programs compiled with a really old ABI though.
+	 */
+	if (is_exec) {
+		return !(vma->vm_flags & VM_EXEC) &&
+			(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+			 !(vma->vm_flags & (VM_READ | VM_WRITE)));
+	}
+
+	if (is_write) {
+		if (unlikely(!(vma->vm_flags & VM_WRITE)))
+			return true;
+		return false;
+	}
+
+	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+		return true;
+
+	return false;
 }
 
+#ifdef CONFIG_PPC_SMLPAR
+static inline void cmo_account_page_fault(void)
+{
+	if (firmware_has_feature(FW_FEATURE_CMO)) {
+		u32 page_ins;
+
+		preempt_disable();
+		page_ins = be32_to_cpu(get_lppaca()->page_ins);
+		page_ins += 1 << PAGE_FACTOR;
+		get_lppaca()->page_ins = cpu_to_be32(page_ins);
+		preempt_enable();
+	}
+}
+#else
+static inline void cmo_account_page_fault(void) { }
+#endif /* CONFIG_PPC_SMLPAR */
+
+#ifdef CONFIG_PPC_STD_MMU
+static void sanity_check_fault(bool is_write, unsigned long error_code)
+{
+	/*
+	 * For hash translation mode, we should never get a
+	 * PROTFAULT. Any update to pte to reduce access will result in us
+	 * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
+	 * fault instead of DSISR_PROTFAULT.
+	 *
+	 * A pte update to relax the access will not result in a hash page table
+	 * entry invalidate and hence can result in DSISR_PROTFAULT.
+	 * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
+	 * the special !is_write in the below conditional.
+	 *
+	 * For platforms that doesn't supports coherent icache and do support
+	 * per page noexec bit, we do setup things such that we do the
+	 * sync between D/I cache via fault. But that is handled via low level
+	 * hash fault code (hash_page_do_lazy_icache()) and we should not reach
+	 * here in such case.
+	 *
+	 * For wrong access that can result in PROTFAULT, the above vma->vm_flags
+	 * check should handle those and hence we should fall to the bad_area
+	 * handling correctly.
+	 *
+	 * For embedded with per page exec support that doesn't support coherent
+	 * icache we do get PROTFAULT and we handle that D/I cache sync in
+	 * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
+	 * is conditional for server MMU.
+	 *
+	 * For radix, we can get prot fault for autonuma case, because radix
+	 * page table will have them marked noaccess for user.
+	 */
+	if (!radix_enabled() && !is_write)
+		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+}
+#else
+static void sanity_check_fault(bool is_write, unsigned long error_code) { }
+#endif /* CONFIG_PPC_STD_MMU */
+
+/*
+ * Define the correct "is_write" bit in error_code based
+ * on the processor family
+ */
+#if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+#define page_fault_is_write(__err)	((__err) & ESR_DST)
+#define page_fault_is_bad(__err)	(0)
+#else
+#define page_fault_is_write(__err)	((__err) & DSISR_ISSTORE)
+#if defined(CONFIG_PPC_8xx)
+#define page_fault_is_bad(__err)	((__err) & DSISR_NOEXEC_OR_G)
+#elif defined(CONFIG_PPC64)
+#define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_64S)
+#else
+#define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_32S)
+#endif
+#endif
+
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
  * for a data fault, SRR1 for an instruction fault. For 400-family processors
@@ -195,92 +375,56 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-int do_page_fault(struct pt_regs *regs, unsigned long address,
-			    unsigned long error_code)
+static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+			   unsigned long error_code)
 {
-	enum ctx_state prev_state = exception_enter();
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-	int code = SEGV_MAPERR;
-	int is_write = 0;
-	int trap = TRAP(regs);
- 	int is_exec = trap == 0x400;
+ 	int is_exec = TRAP(regs) == 0x400;
 	int is_user = user_mode(regs);
-	int fault;
-	int rc = 0, store_update_sp = 0;
+	int is_write = page_fault_is_write(error_code);
+	int fault, major = 0;
+	bool store_update_sp = false;
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-	/*
-	 * Fortunately the bit assignments in SRR1 for an instruction
-	 * fault and DSISR for a data fault are mostly the same for the
-	 * bits we are interested in.  But there are some bits which
-	 * indicate errors in DSISR but can validly be set in SRR1.
-	 */
-	if (is_exec)
-		error_code &= 0x48200000;
-	else
-		is_write = error_code & DSISR_ISSTORE;
-#else
-	is_write = error_code & ESR_DST;
-#endif /* CONFIG_4xx || CONFIG_BOOKE */
+	if (notify_page_fault(regs))
+		return 0;
 
-#ifdef CONFIG_PPC_ICSWX
-	/*
-	 * we need to do this early because this "data storage
-	 * interrupt" does not update the DAR/DEAR so we don't want to
-	 * look at it
-	 */
-	if (error_code & ICSWX_DSI_UCT) {
-		rc = acop_handle_fault(regs, address, error_code);
-		if (rc)
-			goto bail;
+	if (unlikely(page_fault_is_bad(error_code))) {
+		if (is_user) {
+			_exception(SIGBUS, regs, BUS_OBJERR, address);
+			return 0;
+		}
+		return SIGBUS;
 	}
-#endif /* CONFIG_PPC_ICSWX */
-
-	if (notify_page_fault(regs))
-		goto bail;
 
-	if (unlikely(debugger_fault_handler(regs)))
-		goto bail;
+	/* Additional sanity check(s) */
+	sanity_check_fault(is_write, error_code);
 
 	/*
 	 * The kernel should never take an execute fault nor should it
 	 * take a page fault to a kernel address.
 	 */
-	if (!is_user && (is_exec || (address >= TASK_SIZE))) {
-		rc = SIGSEGV;
-		goto bail;
-	}
+	if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address)))
+		return SIGSEGV;
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
-      defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_8xx))
-  	if (error_code & DSISR_DABRMATCH) {
-		/* breakpoint match */
-		do_break(regs, address, error_code);
-		goto bail;
+	/*
+	 * If we're in an interrupt, have no user context or are running
+	 * in a region with pagefaults disabled then we must not take the fault
+	 */
+	if (unlikely(faulthandler_disabled() || !mm)) {
+		if (is_user)
+			printk_ratelimited(KERN_ERR "Page fault in user mode"
+					   " with faulthandler_disabled()=%d"
+					   " mm=%p\n",
+					   faulthandler_disabled(), mm);
+		return bad_area_nosemaphore(regs, address);
 	}
-#endif
 
 	/* We restore the interrupt state now */
 	if (!arch_irq_disabled_regs(regs))
 		local_irq_enable();
 
-	if (faulthandler_disabled() || mm == NULL) {
-		if (!is_user) {
-			rc = SIGSEGV;
-			goto bail;
-		}
-		/* faulthandler_disabled() in user mode is really bad,
-		   as is current->mm == NULL. */
-		printk(KERN_EMERG "Page fault in user mode with "
-		       "faulthandler_disabled() = %d mm = %p\n",
-		       faulthandler_disabled(), mm);
-		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
-		       regs->nip, regs->msr);
-		die("Weird page fault", regs, SIGSEGV);
-	}
-
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/*
@@ -293,6 +437,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 
 	if (is_user)
 		flags |= FAULT_FLAG_USER;
+	if (is_write)
+		flags |= FAULT_FLAG_WRITE;
+	if (is_exec)
+		flags |= FAULT_FLAG_INSTRUCTION;
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -309,9 +457,9 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * source.  If this is invalid we can skip the address space check,
 	 * thus avoiding the deadlock.
 	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 		if (!is_user && !search_exception_tables(regs->nip))
-			goto bad_area_nosemaphore;
+			return bad_area_nosemaphore(regs, address);
 
 retry:
 		down_read(&mm->mmap_sem);
@@ -325,122 +473,24 @@ retry:
 	}
 
 	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
+	if (unlikely(!vma))
+		return bad_area(regs, address);
+	if (likely(vma->vm_start <= address))
 		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
+	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+		return bad_area(regs, address);
 
-	/*
-	 * N.B. The POWER/Open ABI allows programs to access up to
-	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes up to about 1.5kB
-	 * below the stack pointer (r1) before decrementing it.
-	 * The exec code can write slightly over 640kB to the stack
-	 * before setting the user r1.  Thus we allow the stack to
-	 * expand to 1MB without further checks.
-	 */
-	if (address + 0x100000 < vma->vm_end) {
-		/* get user regs even if this fault is in kernel mode */
-		struct pt_regs *uregs = current->thread.regs;
-		if (uregs == NULL)
-			goto bad_area;
+	/* The stack is being expanded, check if it's valid */
+	if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
+		return bad_area(regs, address);
 
-		/*
-		 * A user-mode access to an address a long way below
-		 * the stack pointer is only valid if the instruction
-		 * is one which would update the stack pointer to the
-		 * address accessed if the instruction completed,
-		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-		 * (or the byte, halfword, float or double forms).
-		 *
-		 * If we don't check this then any write to the area
-		 * between the last mapped region and the stack will
-		 * expand the stack rather than segfaulting.
-		 */
-		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
+	/* Try to expand it */
+	if (unlikely(expand_stack(vma, address)))
+		return bad_area(regs, address);
 
 good_area:
-	code = SEGV_ACCERR;
-#if defined(CONFIG_6xx)
-	if (error_code & 0x95700000)
-		/* an error such as lwarx to I/O controller space,
-		   address matching DABR, eciwx, etc. */
-		goto bad_area;
-#endif /* CONFIG_6xx */
-#if defined(CONFIG_8xx)
-        /* The MPC8xx seems to always set 0x80000000, which is
-         * "undefined".  Of those that can be set, this is the only
-         * one which seems bad.
-         */
-	if (error_code & 0x10000000)
-                /* Guarded storage error. */
-		goto bad_area;
-#endif /* CONFIG_8xx */
-
-	if (is_exec) {
-		/*
-		 * Allow execution from readable areas if the MMU does not
-		 * provide separate controls over reading and executing.
-		 *
-		 * Note: That code used to not be enabled for 4xx/BookE.
-		 * It is now as I/D cache coherency for these is done at
-		 * set_pte_at() time and I see no reason why the test
-		 * below wouldn't be valid on those processors. This -may-
-		 * break programs compiled with a really old ABI though.
-		 */
-		if (!(vma->vm_flags & VM_EXEC) &&
-		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
-		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
-			goto bad_area;
-	/* a write */
-	} else if (is_write) {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-		flags |= FAULT_FLAG_WRITE;
-	/* a read */
-	} else {
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
-	}
-#ifdef CONFIG_PPC_STD_MMU
-	/*
-	 * For hash translation mode, we should never get a
-	 * PROTFAULT. Any update to pte to reduce access will result in us
-	 * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
-	 * fault instead of DSISR_PROTFAULT.
-	 *
-	 * A pte update to relax the access will not result in a hash page table
-	 * entry invalidate and hence can result in DSISR_PROTFAULT.
-	 * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
-	 * the special !is_write in the below conditional.
-	 *
-	 * For platforms that doesn't supports coherent icache and do support
-	 * per page noexec bit, we do setup things such that we do the
-	 * sync between D/I cache via fault. But that is handled via low level
-	 * hash fault code (hash_page_do_lazy_icache()) and we should not reach
-	 * here in such case.
-	 *
-	 * For wrong access that can result in PROTFAULT, the above vma->vm_flags
-	 * check should handle those and hence we should fall to the bad_area
-	 * handling correctly.
-	 *
-	 * For embedded with per page exec support that doesn't support coherent
-	 * icache we do get PROTFAULT and we handle that D/I cache sync in
-	 * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
-	 * is conditional for server MMU.
-	 *
-	 * For radix, we can get prot fault for autonuma case, because radix
-	 * page table will have them marked noaccess for user.
-	 */
-	if (!radix_enabled() && !is_write)
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
+	if (unlikely(access_error(is_write, is_exec, vma)))
+		return bad_area(regs, address);
 
 	/*
 	 * If for any reason at all we couldn't handle the fault,
@@ -448,6 +498,7 @@ good_area:
 	 * the fault.
 	 */
 	fault = handle_mm_fault(vma, address, flags);
+	major |= fault & VM_FAULT_MAJOR;
 
 	/*
 	 * Handle the retry right now, the mmap_sem has been released in that
@@ -465,64 +516,39 @@ good_area:
 			if (!fatal_signal_pending(current))
 				goto retry;
 		}
-		/* We will enter mm_fault_error() below */
-	} else
-		up_read(&current->mm->mmap_sem);
-
-	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-		if (fault & VM_FAULT_SIGSEGV)
-			goto bad_area_nosemaphore;
-		rc = mm_fault_error(regs, address, fault);
-		if (rc >= MM_FAULT_RETURN)
-			goto bail;
-		else
-			rc = 0;
+
+		/*
+		 * User mode? Just return to handle the fatal exception otherwise
+		 * return to bad_page_fault
+		 */
+		return is_user ? 0 : SIGBUS;
 	}
 
+	up_read(&current->mm->mmap_sem);
+
+	if (unlikely(fault & VM_FAULT_ERROR))
+		return mm_fault_error(regs, address, fault);
+
 	/*
 	 * Major/minor page fault accounting.
 	 */
-	if (fault & VM_FAULT_MAJOR) {
+	if (major) {
 		current->maj_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-			      regs, address);
-#ifdef CONFIG_PPC_SMLPAR
-		if (firmware_has_feature(FW_FEATURE_CMO)) {
-			u32 page_ins;
-
-			preempt_disable();
-			page_ins = be32_to_cpu(get_lppaca()->page_ins);
-			page_ins += 1 << PAGE_FACTOR;
-			get_lppaca()->page_ins = cpu_to_be32(page_ins);
-			preempt_enable();
-		}
-#endif /* CONFIG_PPC_SMLPAR */
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+		cmo_account_page_fault();
 	} else {
 		current->min_flt++;
-		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-			      regs, address);
-	}
-
-	goto bail;
-
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses cause a SIGSEGV */
-	if (is_user) {
-		_exception(SIGSEGV, regs, code, address);
-		goto bail;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 	}
+	return 0;
+}
+NOKPROBE_SYMBOL(__do_page_fault);
 
-	if (is_exec && (error_code & DSISR_PROTFAULT))
-		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
-				   " page (%lx) - exploit attempt? (uid: %d)\n",
-				   address, from_kuid(&init_user_ns, current_uid()));
-
-	rc = SIGSEGV;
-
-bail:
+int do_page_fault(struct pt_regs *regs, unsigned long address,
+		  unsigned long error_code)
+{
+	enum ctx_state prev_state = exception_enter();
+	int rc = __do_page_fault(regs, address, error_code);
 	exception_exit(prev_state);
 	return rc;
 }
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 6f962e5cb5e1..ffbd7c0bda96 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -575,7 +575,6 @@ _GLOBAL(flush_hash_pages)
 	rlwinm	r8,r8,0,31,29		/* clear HASHPTE bit */
 	stwcx.	r8,0,r5			/* update the pte */
 	bne-	33b
-EXPORT_SYMBOL(flush_hash_pages)
 
 	/* Get the address of the primary PTE group in the hash table (r3) */
 _GLOBAL(flush_hash_patch_A)
@@ -634,6 +633,7 @@ _GLOBAL(flush_hash_patch_B)
 	SYNC_601
 	isync
 	blr
+EXPORT_SYMBOL(flush_hash_pages)
 
 /*
  * Flush an entry from the TLB
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a20669c19e7..655a5a9a183d 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -21,6 +21,7 @@
 #undef DEBUG
 #undef DEBUG_LOW
 
+#define pr_fmt(fmt) "hash-mmu: " fmt
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/sched/mm.h>
@@ -61,6 +62,7 @@
 #include <asm/tm.h>
 #include <asm/trace.h>
 #include <asm/ps3.h>
+#include <asm/pte-walk.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -507,9 +509,9 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
 	printk(KERN_INFO "Huge page(16GB) memory: "
 			"addr = 0x%lX size = 0x%lX pages = %d\n",
 			phys_addr, block_size, expected_pages);
-	if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
+	if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
 		memblock_reserve(phys_addr, block_size * expected_pages);
-		add_gpage(phys_addr, block_size, expected_pages);
+		pseries_add_gpage(phys_addr, block_size, expected_pages);
 	}
 	return 0;
 }
@@ -1019,6 +1021,7 @@ void __init hash__early_init_mmu(void)
 	__kernel_virt_size = H_KERN_VIRT_SIZE;
 	__vmalloc_start = H_VMALLOC_START;
 	__vmalloc_end = H_VMALLOC_END;
+	__kernel_io_start = H_KERN_IO_START;
 	vmemmap = (struct page *)H_VMEMMAP_BASE;
 	ioremap_bot = IOREMAP_BASE;
 
@@ -1228,7 +1231,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 	unsigned long vsid;
 	pte_t *ptep;
 	unsigned hugeshift;
-	const struct cpumask *tmp;
 	int rc, user_region = 0;
 	int psize, ssize;
 
@@ -1280,8 +1282,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 	}
 
 	/* Check CPU locality */
-	tmp = cpumask_of(smp_processor_id());
-	if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
+	if (user_region && mm_is_thread_local(mm))
 		flags |= HPTE_LOCAL_UPDATE;
 
 #ifndef CONFIG_PPC_64K_PAGES
@@ -1297,7 +1298,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift);
+	ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		rc = 1;
@@ -1526,7 +1527,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * THP pages use update_mmu_cache_pmd. We don't do
 	 * hash preload there. Hence can ignore THP here
 	 */
-	ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift);
+	ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
 	if (!ptep)
 		goto out_exit;
 
@@ -1543,7 +1544,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Is that local to this CPU ? */
-	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+	if (mm_is_thread_local(mm))
 		update_flags |= HPTE_LOCAL_UPDATE;
 
 	/* Hash it in */
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index e292c8a60952..668e87d03f9e 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * highmem.c: virtual kernel memory mappings for high memory
  *
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index bfe4e8526b2d..f84ec46cdb26 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * PPC Huge TLB Page Support for Book3E MMU
  *
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index a84bb44497f9..0c2a91df3210 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
  *
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index a12e86395025..2486bee0f93e 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <asm/pgtable.h>
@@ -48,17 +49,22 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct hstate *h = hstate_file(file);
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
 	struct vm_unmapped_area_info info;
 
-	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (len > mm->task_size)
+	if (len > high_limit)
 		return -ENOMEM;
 
-	if (flags & MAP_FIXED) {
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
 		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
@@ -67,7 +73,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (mm->task_size - len >= addr &&
+		if (high_limit - len >= addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
@@ -78,12 +84,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 
-	if (addr > DEFAULT_MAP_WINDOW)
-		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
-
 	return vm_unmapped_area(&info);
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e1bf5ca397fe..a9b9083c5e49 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,6 +24,8 @@
 #include <asm/tlb.h>
 #include <asm/setup.h>
 #include <asm/hugetlb.h>
+#include <asm/pte-walk.h>
+
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -36,32 +38,15 @@
 unsigned int HPAGE_SHIFT;
 EXPORT_SYMBOL(HPAGE_SHIFT);
 
-/*
- * Tracks gpages after the device tree is scanned and before the
- * huge_boot_pages list is ready.  On non-Freescale implementations, this is
- * just used to track 16G pages and so is a single array.  FSL-based
- * implementations may have more than one gpage size, so we need multiple
- * arrays
- */
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-#define MAX_NUMBER_GPAGES	128
-struct psize_gpages {
-	u64 gpage_list[MAX_NUMBER_GPAGES];
-	unsigned int nr_gpages;
-};
-static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
-#else
-#define MAX_NUMBER_GPAGES	1024
-static u64 gpage_freearray[MAX_NUMBER_GPAGES];
-static unsigned nr_gpages;
-#endif
-
 #define hugepd_none(hpd)	(hpd_val(hpd) == 0)
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
-	/* Only called for hugetlbfs pages, hence can ignore THP */
-	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
+	/*
+	 * Only called for hugetlbfs pages, hence can ignore THP and the
+	 * irq disabled walk.
+	 */
+	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -210,145 +195,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	return hugepte_offset(*hpdp, addr, pdshift);
 }
 
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-/* Build list of addresses of gigantic pages.  This function is used in early
- * boot before the buddy allocator is setup.
- */
-void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
-{
-	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
-	int i;
-
-	if (addr == 0)
-		return;
-
-	gpage_freearray[idx].nr_gpages = number_of_pages;
-
-	for (i = 0; i < number_of_pages; i++) {
-		gpage_freearray[idx].gpage_list[i] = addr;
-		addr += page_size;
-	}
-}
-
-/*
- * Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.
- */
-int alloc_bootmem_huge_page(struct hstate *hstate)
-{
-	struct huge_bootmem_page *m;
-	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
-	int nr_gpages = gpage_freearray[idx].nr_gpages;
-
-	if (nr_gpages == 0)
-		return 0;
-
-#ifdef CONFIG_HIGHMEM
-	/*
-	 * If gpages can be in highmem we can't use the trick of storing the
-	 * data structure in the page; allocate space for this
-	 */
-	m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
-	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
-#else
-	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
-#endif
-
-	list_add(&m->list, &huge_boot_pages);
-	gpage_freearray[idx].nr_gpages = nr_gpages;
-	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
-	m->hstate = hstate;
-
-	return 1;
-}
+#ifdef CONFIG_PPC_BOOK3S_64
 /*
- * Scan the command line hugepagesz= options for gigantic pages; store those in
- * a list that we use to allocate the memory once all options are parsed.
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready on pseries.
  */
-
-unsigned long gpage_npages[MMU_PAGE_COUNT];
-
-static int __init do_gpage_early_setup(char *param, char *val,
-				       const char *unused, void *arg)
-{
-	static phys_addr_t size;
-	unsigned long npages;
-
-	/*
-	 * The hugepagesz and hugepages cmdline options are interleaved.  We
-	 * use the size variable to keep track of whether or not this was done
-	 * properly and skip over instances where it is incorrect.  Other
-	 * command-line parsing code will issue warnings, so we don't need to.
-	 *
-	 */
-	if ((strcmp(param, "default_hugepagesz") == 0) ||
-	    (strcmp(param, "hugepagesz") == 0)) {
-		size = memparse(val, NULL);
-	} else if (strcmp(param, "hugepages") == 0) {
-		if (size != 0) {
-			if (sscanf(val, "%lu", &npages) <= 0)
-				npages = 0;
-			if (npages > MAX_NUMBER_GPAGES) {
-				pr_warn("MMU: %lu pages requested for page "
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-					"size %llu KB, limiting to "
-#else
-					"size %u KB, limiting to "
-#endif
-					__stringify(MAX_NUMBER_GPAGES) "\n",
-					npages, size / 1024);
-				npages = MAX_NUMBER_GPAGES;
-			}
-			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
-			size = 0;
-		}
-	}
-	return 0;
-}
-
+#define MAX_NUMBER_GPAGES	1024
+__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
+__initdata static unsigned nr_gpages;
 
 /*
- * This function allocates physical space for pages that are larger than the
- * buddy allocator can handle.  We want to allocate these in highmem because
- * the amount of lowmem is limited.  This means that this function MUST be
- * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
- * allocate to grab highmem.
- */
-void __init reserve_hugetlb_gpages(void)
-{
-	static __initdata char cmdline[COMMAND_LINE_SIZE];
-	phys_addr_t size, base;
-	int i;
-
-	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
-	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
-			NULL, &do_gpage_early_setup);
-
-	/*
-	 * Walk gpage list in reverse, allocating larger page sizes first.
-	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
-	 * When we reach the point in the list where pages are no longer
-	 * considered gpages, we're done.
-	 */
-	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
-		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
-			continue;
-		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
-			break;
-
-		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
-		base = memblock_alloc_base(size * gpage_npages[i], size,
-					   MEMBLOCK_ALLOC_ANYWHERE);
-		add_gpage(base, size, gpage_npages[i]);
-	}
-}
-
-#else /* !PPC_FSL_BOOK3E */
-
-/* Build list of addresses of gigantic pages.  This function is used in early
+ * Build list of addresses of gigantic pages.  This function is used in early
  * boot before the buddy allocator is setup.
  */
-void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 {
 	if (!addr)
 		return;
@@ -360,10 +220,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 	}
 }
 
-/* Moves the gigantic page addresses from the temporary list to the
- * huge_boot_pages list.
- */
-int alloc_bootmem_huge_page(struct hstate *hstate)
+int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
 {
 	struct huge_bootmem_page *m;
 	if (nr_gpages == 0)
@@ -376,6 +233,17 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 }
 #endif
 
+
+int __init alloc_bootmem_huge_page(struct hstate *h)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
+		return pseries_alloc_bootmem_huge_page(h);
+#endif
+	return __alloc_bootmem_huge_page(h);
+}
+
 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 #define HUGEPD_FREELIST_SIZE \
 	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -407,8 +275,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 	batchp = &get_cpu_var(hugepd_freelist_cur);
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
-	    cpumask_equal(mm_cpumask(tlb->mm),
-			  cpumask_of(smp_processor_id()))) {
+	    mm_is_thread_local(tlb->mm)) {
 		kmem_cache_free(hugepte_cache, hugepte);
 		put_cpu_var(hugepd_freelist_cur);
 		return;
@@ -566,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 /*
@@ -886,9 +754,8 @@ void flush_dcache_icache_hugepage(struct page *page)
  * This function need to be called with interrupts disabled. We use this variant
  * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
  */
-
-pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				   bool *is_thp, unsigned *shift)
+pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+			bool *is_thp, unsigned *hpage_shift)
 {
 	pgd_t pgd, *pgdp;
 	pud_t pud, *pudp;
@@ -897,8 +764,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	hugepd_t *hpdp = NULL;
 	unsigned pdshift = PGDIR_SHIFT;
 
-	if (shift)
-		*shift = 0;
+	if (hpage_shift)
+		*hpage_shift = 0;
 
 	if (is_thp)
 		*is_thp = false;
@@ -968,16 +835,15 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
 	pdshift = hugepd_shift(*hpdp);
 out:
-	if (shift)
-		*shift = pdshift;
+	if (hpage_shift)
+		*hpage_shift = pdshift;
 	return ret_pte;
 }
-EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte);
 
 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
-	unsigned long mask;
 	unsigned long pte_end;
 	struct page *head, *page;
 	pte_t pte;
@@ -988,18 +854,10 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		end = pte_end;
 
 	pte = READ_ONCE(*ptep);
-	mask = _PAGE_PRESENT | _PAGE_READ;
 
-	/*
-	 * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
-	 * as 0 and _PAGE_RO has to be set when a page is not writable
-	 */
-	if (write)
-		mask |= _PAGE_WRITE;
-	else
-		mask |= _PAGE_RO;
-
-	if ((pte_val(pte) & mask) != mask)
+	if (!pte_present(pte) || !pte_read(pte))
+		return 0;
+	if (write && !pte_write(pte))
 		return 0;
 
 	/* hugepages are never "special" */
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c
deleted file mode 100644
index 1fa794d7d59f..000000000000
--- a/arch/powerpc/mm/icswx.c
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- *  ICSWX and ACOP Management
- *
- *  Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-#include "icswx.h"
-
-/*
- * The processor and its L2 cache cause the icswx instruction to
- * generate a COP_REQ transaction on PowerBus. The transaction has no
- * address, and the processor does not perform an MMU access to
- * authenticate the transaction. The command portion of the PowerBus
- * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor
- * Process ID (PID), which the coprocessor compares to the authorized
- * LPID and PID held in the coprocessor, to determine if the process
- * is authorized to generate the transaction.  The data of the COP_REQ
- * transaction is 128-byte or less in size and is placed in cacheable
- * memory on a 128-byte cache line boundary.
- *
- * The task to use a coprocessor should use use_cop() to mark the use
- * of the Coprocessor Type (CT) and context switching. On a server
- * class processor, the PID register is used only for coprocessor
- * management + * and so a coprocessor PID is allocated before
- * executing icswx + * instruction. Drop_cop() is used to free the
- * coprocessor PID.
- *
- * Example:
- * Host Fabric Interface (HFI) is a PowerPC network coprocessor.
- * Each HFI have multiple windows. Each HFI window serves as a
- * network device sending to and receiving from HFI network.
- * HFI immediate send function uses icswx instruction. The immediate
- * send function allows small (single cache-line) packets be sent
- * without using the regular HFI send FIFO and doorbell, which are
- * much slower than immediate send.
- *
- * For each task intending to use HFI immediate send, the HFI driver
- * calls use_cop() to obtain a coprocessor PID for the task.
- * The HFI driver then allocate a free HFI window and save the
- * coprocessor PID to the HFI window to allow the task to use the
- * HFI window.
- *
- * The HFI driver repeatedly creates immediate send packets and
- * issues icswx instruction to send data through the HFI window.
- * The HFI compares the coprocessor PID in the CPU PID register
- * to the PID held in the HFI window to determine if the transaction
- * is allowed.
- *
- * When the task to release the HFI window, the HFI driver calls
- * drop_cop() to release the coprocessor PID.
- */
-
-void switch_cop(struct mm_struct *next)
-{
-#ifdef CONFIG_PPC_ICSWX_PID
-	mtspr(SPRN_PID, next->context.cop_pid);
-#endif
-	mtspr(SPRN_ACOP, next->context.acop);
-}
-
-/**
- * Start using a coprocessor.
- * @acop: mask of coprocessor to be used.
- * @mm: The mm the coprocessor to associate with. Most likely current mm.
- *
- * Return a positive PID if successful. Negative errno otherwise.
- * The returned PID will be fed to the coprocessor to determine if an
- * icswx transaction is authenticated.
- */
-int use_cop(unsigned long acop, struct mm_struct *mm)
-{
-	int ret;
-
-	if (!cpu_has_feature(CPU_FTR_ICSWX))
-		return -ENODEV;
-
-	if (!mm || !acop)
-		return -EINVAL;
-
-	/* The page_table_lock ensures mm_users won't change under us */
-	spin_lock(&mm->page_table_lock);
-	spin_lock(mm->context.cop_lockp);
-
-	ret = get_cop_pid(mm);
-	if (ret < 0)
-		goto out;
-
-	/* update acop */
-	mm->context.acop |= acop;
-
-	sync_cop(mm);
-
-	/*
-	 * If this is a threaded process then there might be other threads
-	 * running. We need to send an IPI to force them to pick up any
-	 * change in PID and ACOP.
-	 */
-	if (atomic_read(&mm->mm_users) > 1)
-		smp_call_function(sync_cop, mm, 1);
-
-out:
-	spin_unlock(mm->context.cop_lockp);
-	spin_unlock(&mm->page_table_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(use_cop);
-
-/**
- * Stop using a coprocessor.
- * @acop: mask of coprocessor to be stopped.
- * @mm: The mm the coprocessor associated with.
- */
-void drop_cop(unsigned long acop, struct mm_struct *mm)
-{
-	int free_pid;
-
-	if (!cpu_has_feature(CPU_FTR_ICSWX))
-		return;
-
-	if (WARN_ON_ONCE(!mm))
-		return;
-
-	/* The page_table_lock ensures mm_users won't change under us */
-	spin_lock(&mm->page_table_lock);
-	spin_lock(mm->context.cop_lockp);
-
-	mm->context.acop &= ~acop;
-
-	free_pid = disable_cop_pid(mm);
-	sync_cop(mm);
-
-	/*
-	 * If this is a threaded process then there might be other threads
-	 * running. We need to send an IPI to force them to pick up any
-	 * change in PID and ACOP.
-	 */
-	if (atomic_read(&mm->mm_users) > 1)
-		smp_call_function(sync_cop, mm, 1);
-
-	if (free_pid != COP_PID_NONE)
-		free_cop_pid(free_pid);
-
-	spin_unlock(mm->context.cop_lockp);
-	spin_unlock(&mm->page_table_lock);
-}
-EXPORT_SYMBOL_GPL(drop_cop);
-
-static int acop_use_cop(int ct)
-{
-	/* There is no alternate policy, yet */
-	return -1;
-}
-
-/*
- * Get the instruction word at the NIP
- */
-static u32 acop_get_inst(struct pt_regs *regs)
-{
-	u32 inst;
-	u32 __user *p;
-
-	p = (u32 __user *)regs->nip;
-	if (!access_ok(VERIFY_READ, p, sizeof(*p)))
-		return 0;
-
-	if (__get_user(inst, p))
-		return 0;
-
-	return inst;
-}
-
-/**
- * @regs: registers at time of interrupt
- * @address: storage address
- * @error_code: Fault code, usually the DSISR or ESR depending on
- *		processor type
- *
- * Return 0 if we are able to resolve the data storage fault that
- * results from a CT miss in the ACOP register.
- */
-int acop_handle_fault(struct pt_regs *regs, unsigned long address,
-		      unsigned long error_code)
-{
-	int ct;
-	u32 inst = 0;
-
-	if (!cpu_has_feature(CPU_FTR_ICSWX)) {
-		pr_info("No coprocessors available");
-		_exception(SIGILL, regs, ILL_ILLOPN, address);
-	}
-
-	if (!user_mode(regs)) {
-		/* this could happen if the HV denies the
-		 * kernel access, for now we just die */
-		die("ICSWX from kernel failed", regs, SIGSEGV);
-	}
-
-	/* Some implementations leave us a hint for the CT */
-	ct = ICSWX_GET_CT_HINT(error_code);
-	if (ct < 0) {
-		/* we have to peek at the instruction word to figure out CT */
-		u32 ccw;
-		u32 rs;
-
-		inst = acop_get_inst(regs);
-		if (inst == 0)
-			return -1;
-
-		rs = (inst >> (31 - 10)) & 0x1f;
-		ccw = regs->gpr[rs];
-		ct = (ccw >> 16) & 0x3f;
-	}
-
-	/*
-	 * We could be here because another thread has enabled acop
-	 * but the ACOP register has yet to be updated.
-	 *
-	 * This should have been taken care of by the IPI to sync all
-	 * the threads (see smp_call_function(sync_cop, mm, 1)), but
-	 * that could take forever if there are a significant amount
-	 * of threads.
-	 *
-	 * Given the number of threads on some of these systems,
-	 * perhaps this is the best way to sync ACOP rather than whack
-	 * every thread with an IPI.
-	 */
-	if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) {
-		sync_cop(current->active_mm);
-		return 0;
-	}
-
-	/* check for alternate policy */
-	if (!acop_use_cop(ct))
-		return 0;
-
-	/* at this point the CT is unknown to the system */
-	pr_warn("%s[%d]: Coprocessor %d is unavailable\n",
-		current->comm, current->pid, ct);
-
-	/* get inst if we don't already have it */
-	if (inst == 0) {
-		inst = acop_get_inst(regs);
-		if (inst == 0)
-			return -1;
-	}
-
-	/* Check if the instruction is the "record form" */
-	if (inst & 1) {
-		/*
-		 * the instruction is "record" form so we can reject
-		 * using CR0
-		 */
-		regs->ccr &= ~(0xful << 28);
-		regs->ccr |= ICSWX_RC_NOT_FOUND << 28;
-
-		/* Move on to the next instruction */
-		regs->nip += 4;
-	} else {
-		/*
-		 * There is no architected mechanism to report a bad
-		 * CT so we could either SIGILL or report nothing.
-		 * Since the non-record version should only bu used
-		 * for "hints" or "don't care" we should probably do
-		 * nothing.  However, I could see how some people
-		 * might want an SIGILL so it here if you want it.
-		 */
-#ifdef CONFIG_PPC_ICSWX_USE_SIGILL
-		_exception(SIGILL, regs, ILL_ILLOPN, address);
-#else
-		regs->nip += 4;
-#endif
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(acop_handle_fault);
diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h
deleted file mode 100644
index 6dedc08e62c8..000000000000
--- a/arch/powerpc/mm/icswx.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _ARCH_POWERPC_MM_ICSWX_H_
-#define _ARCH_POWERPC_MM_ICSWX_H_
-
-/*
- *  ICSWX and ACOP Management
- *
- *  Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <asm/mmu_context.h>
-
-/* also used to denote that PIDs are not used */
-#define COP_PID_NONE 0
-
-static inline void sync_cop(void *arg)
-{
-	struct mm_struct *mm = arg;
-
-	if (mm == current->active_mm)
-		switch_cop(current->active_mm);
-}
-
-#ifdef CONFIG_PPC_ICSWX_PID
-extern int get_cop_pid(struct mm_struct *mm);
-extern int disable_cop_pid(struct mm_struct *mm);
-extern void free_cop_pid(int free_pid);
-#else
-#define get_cop_pid(m) (COP_PID_NONE)
-#define disable_cop_pid(m) (COP_PID_NONE)
-#define free_cop_pid(p)
-#endif
-
-/*
- * These are implementation bits for architected registers.  If this
- * ever becomes architecture the should be moved to reg.h et. al.
- */
-/* UCT is the same bit for Server and Embedded */
-#define ICSWX_DSI_UCT		0x00004000  /* Unavailable Coprocessor Type */
-
-#ifdef CONFIG_PPC_BOOK3E
-/* Embedded implementation gives us no hints as to what the CT is */
-#define ICSWX_GET_CT_HINT(x) (-1)
-#else
-/* Server implementation contains the CT value in the DSISR */
-#define ICSWX_DSISR_CTMASK	0x00003f00
-#define ICSWX_GET_CT_HINT(x)	(((x) & ICSWX_DSISR_CTMASK) >> 8)
-#endif
-
-#define ICSWX_RC_STARTED	0x8	/* The request has been started */
-#define ICSWX_RC_NOT_IDLE	0x4	/* No coprocessor found idle */
-#define ICSWX_RC_NOT_FOUND	0x2	/* No coprocessor found */
-#define ICSWX_RC_UNDEFINED	0x1	/* Reserved */
-
-extern int acop_handle_fault(struct pt_regs *regs, unsigned long address,
-			     unsigned long error_code);
-
-static inline u64 acop_copro_type_bit(unsigned int type)
-{
-	return 1ULL << (63 - type);
-}
-
-#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */
diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c
deleted file mode 100644
index 91e30eb7d054..000000000000
--- a/arch/powerpc/mm/icswx_pid.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  ICSWX and ACOP/PID Management
- *
- *  Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include "icswx.h"
-
-#define COP_PID_MIN (COP_PID_NONE + 1)
-#define COP_PID_MAX (0xFFFF)
-
-static DEFINE_SPINLOCK(mmu_context_acop_lock);
-static DEFINE_IDA(cop_ida);
-
-static int new_cop_pid(struct ida *ida, int min_id, int max_id,
-		       spinlock_t *lock)
-{
-	int index;
-	int err;
-
-again:
-	if (!ida_pre_get(ida, GFP_KERNEL))
-		return -ENOMEM;
-
-	spin_lock(lock);
-	err = ida_get_new_above(ida, min_id, &index);
-	spin_unlock(lock);
-
-	if (err == -EAGAIN)
-		goto again;
-	else if (err)
-		return err;
-
-	if (index > max_id) {
-		spin_lock(lock);
-		ida_remove(ida, index);
-		spin_unlock(lock);
-		return -ENOMEM;
-	}
-
-	return index;
-}
-
-int get_cop_pid(struct mm_struct *mm)
-{
-	int pid;
-
-	if (mm->context.cop_pid == COP_PID_NONE) {
-		pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX,
-				  &mmu_context_acop_lock);
-		if (pid >= 0)
-			mm->context.cop_pid = pid;
-	}
-	return mm->context.cop_pid;
-}
-
-int disable_cop_pid(struct mm_struct *mm)
-{
-	int free_pid = COP_PID_NONE;
-
-	if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) {
-		free_pid = mm->context.cop_pid;
-		mm->context.cop_pid = COP_PID_NONE;
-	}
-	return free_pid;
-}
-
-void free_cop_pid(int free_pid)
-{
-	spin_lock(&mmu_context_acop_lock);
-	ida_remove(&cop_ida, free_pid);
-	spin_unlock(&mmu_context_acop_lock);
-}
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 8a7c38b8d335..6419b33ca309 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -113,6 +113,12 @@ void __init MMU_setup(void)
 		__map_without_bats = 1;
 		__map_without_ltlbs = 1;
 	}
+#ifdef CONFIG_STRICT_KERNEL_RWX
+	if (rodata_enabled) {
+		__map_without_bats = 1;
+		__map_without_ltlbs = 1;
+	}
+#endif
 }
 
 /*
@@ -132,8 +138,6 @@ void __init MMU_init(void)
 	 * Reserve gigantic pages for hugetlb.  This MUST occur before
 	 * lowmem_end_addr is initialized below.
 	 */
-	reserve_hugetlb_gpages();
-
 	if (memblock.memory.cnt > 1) {
 #ifndef CONFIG_WII
 		memblock_enforce_memory_limit(memblock.memory.regions[0].size);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 5b4c25d12ff3..a07722531b32 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -68,11 +68,11 @@
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 #if H_PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 phys_addr_t memstart_addr = ~0;
 EXPORT_SYMBOL_GPL(memstart_addr);
@@ -356,7 +356,7 @@ struct page *realmode_pfn_to_page(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 
-#elif defined(CONFIG_FLATMEM)
+#else
 
 struct page *realmode_pfn_to_page(unsigned long pfn)
 {
@@ -365,13 +365,22 @@ struct page *realmode_pfn_to_page(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 
-#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
 
-#ifdef CONFIG_PPC_STD_MMU_64
-static bool disable_radix;
 static int __init parse_disable_radix(char *p)
 {
-	disable_radix = true;
+	bool val;
+
+	if (strlen(p) == 0)
+		val = true;
+	else if (kstrtobool(p, &val))
+		return -EINVAL;
+
+	disable_radix = val;
+
 	return 0;
 }
 early_param("disable_radix", parse_disable_radix);
@@ -381,7 +390,7 @@ early_param("disable_radix", parse_disable_radix);
  * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
  * radix.  If not, we clear the radix feature bit so we fall back to hash.
  */
-static void early_check_vec5(void)
+static void __init early_check_vec5(void)
 {
 	unsigned long root, chosen;
 	int size;
@@ -444,4 +453,4 @@ void __init mmu_early_init_devtree(void)
 	else
 		hash__early_init_devtree();
 }
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 46b4e67d2372..4362b86ef84c 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -436,7 +436,7 @@ void flush_dcache_icache_page(struct page *page)
 		return;
 	}
 #endif
-#if defined(CONFIG_8xx) || defined(CONFIG_PPC64)
+#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64)
 	/* On 8xx there is no need to kmap since highmem is not supported */
 	__flush_dcache_icache(page_address(page));
 #else
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 5d78b193fec4..d503f344e476 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -106,22 +106,27 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
 	struct vm_unmapped_area_info info;
 
-	if (unlikely(addr > mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
 
-	if (len > mm->task_size - mmap_min_addr)
+	if (len > high_limit)
 		return -ENOMEM;
 
-	if (flags & MAP_FIXED)
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
 		return addr;
+	}
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
+		if (high_limit - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
@@ -129,13 +134,9 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
+	info.high_limit = high_limit;
 	info.align_mask = 0;
 
-	if (unlikely(addr > DEFAULT_MAP_WINDOW))
-		info.high_limit = mm->context.addr_limit;
-	else
-		info.high_limit = DEFAULT_MAP_WINDOW;
-
 	return vm_unmapped_area(&info);
 }
 
@@ -149,37 +150,37 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long addr = addr0;
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
 	struct vm_unmapped_area_info info;
 
-	if (unlikely(addr > mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
 
-	/* requested length too big for entire address space */
-	if (len > mm->task_size - mmap_min_addr)
+	if (len > high_limit)
 		return -ENOMEM;
 
-	if (flags & MAP_FIXED)
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
 		return addr;
+	}
 
-	/* requesting a specific address */
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
-				(!vma || addr + len <= vm_start_gap(vma)))
+		if (high_limit - len >= addr && addr >= mmap_min_addr &&
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-	info.high_limit = mm->mmap_base;
+	info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
 	info.align_mask = 0;
 
-	if (addr > DEFAULT_MAP_WINDOW)
-		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
-
 	addr = vm_unmapped_area(&info);
 	if (!(addr & ~PAGE_MASK))
 		return addr;
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
new file mode 100644
index 000000000000..d60a62bf4fc7
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context.c
@@ -0,0 +1,90 @@
+/*
+ *  Common implementation of switch_mm_irqs_off
+ *
+ *  Copyright IBM Corp. 2017
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+
+#include <asm/mmu_context.h>
+
+#if defined(CONFIG_PPC32)
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	/* 32-bit keeps track of the current PGDIR in the thread struct */
+	tsk->thread.pgdir = mm->pgd;
+}
+#elif defined(CONFIG_PPC_BOOK3E_64)
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	/* 64-bit Book3E keeps track of current PGD in the PACA */
+	get_paca()->pgd = mm->pgd;
+}
+#else
+static inline void switch_mm_pgdir(struct task_struct *tsk,
+				   struct mm_struct *mm) { }
+#endif
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+			struct task_struct *tsk)
+{
+	bool new_on_cpu = false;
+
+	/* Mark this context has been used on the new CPU */
+	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
+		cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+		inc_mm_active_cpus(next);
+
+		/*
+		 * This full barrier orders the store to the cpumask above vs
+		 * a subsequent operation which allows this CPU to begin loading
+		 * translations for next.
+		 *
+		 * When using the radix MMU that operation is the load of the
+		 * MMU context id, which is then moved to SPRN_PID.
+		 *
+		 * For the hash MMU it is either the first load from slb_cache
+		 * in switch_slb(), and/or the store of paca->mm_ctx_id in
+		 * copy_mm_to_paca().
+		 *
+		 * On the read side the barrier is in pte_xchg(), which orders
+		 * the store to the PTE vs the load of mm_cpumask.
+		 */
+		smp_mb();
+
+		new_on_cpu = true;
+	}
+
+	/* Some subarchs need to track the PGD elsewhere */
+	switch_mm_pgdir(tsk, next);
+
+	/* Nothing else to do if we aren't actually switching */
+	if (prev == next)
+		return;
+
+	/*
+	 * We must stop all altivec streams before changing the HW
+	 * context
+	 */
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		asm volatile ("dssall");
+
+	if (new_on_cpu)
+		radix_kvm_prefetch_workaround(next);
+
+	/*
+	 * The actual HW switching method differs between the various
+	 * sub architectures. Out of line for now
+	 */
+	switch_mmu_context(prev, next, tsk);
+}
+
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index abed1fe6992f..59c0766ae4e0 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -25,8 +25,6 @@
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 
-#include "icswx.h"
-
 static DEFINE_SPINLOCK(mmu_context_lock);
 static DEFINE_IDA(mmu_context_ida);
 
@@ -95,11 +93,11 @@ static int hash__init_new_context(struct mm_struct *mm)
 		return index;
 
 	/*
-	 * We do switch_slb() early in fork, even before we setup the
-	 * mm->context.addr_limit. Default to max task size so that we copy the
-	 * default values to paca which will help us to handle slb miss early.
+	 * In the case of exec, use the default limit,
+	 * otherwise inherit it from the mm we are duplicating.
 	 */
-	mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+	if (!mm->context.slb_addr_limit)
+		mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
 
 	/*
 	 * The old code would re-promote on fork, we don't do that when using
@@ -126,9 +124,10 @@ static int hash__init_new_context(struct mm_struct *mm)
 static int radix__init_new_context(struct mm_struct *mm)
 {
 	unsigned long rts_field;
-	int index;
+	int index, max_id;
 
-	index = alloc_context_id(1, PRTB_ENTRIES - 1);
+	max_id = (1 << mmu_pid_bits) - 1;
+	index = alloc_context_id(mmu_base_pid, max_id);
 	if (index < 0)
 		return index;
 
@@ -164,16 +163,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 		return index;
 
 	mm->context.id = index;
-#ifdef CONFIG_PPC_ICSWX
-	mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
-	if (!mm->context.cop_lockp) {
-		__destroy_context(index);
-		subpage_prot_free(mm);
-		mm->context.id = MMU_NO_CONTEXT;
-		return -ENOMEM;
-	}
-	spin_lock_init(mm->context.cop_lockp);
-#endif /* CONFIG_PPC_ICSWX */
 
 #ifdef CONFIG_PPC_64K_PAGES
 	mm->context.pte_frag = NULL;
@@ -181,6 +170,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	mm_iommu_init(mm);
 #endif
+	atomic_set(&mm->context.active_cpus, 0);
+
 	return 0;
 }
 
@@ -209,7 +200,7 @@ static void destroy_pagetable_page(struct mm_struct *mm)
 	/* We allow PTE_FRAG_NR fragments from a PTE page */
 	if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) {
 		pgtable_page_dtor(page);
-		free_hot_cold_page(page, 0);
+		free_unref_page(page);
 	}
 }
 
@@ -225,25 +216,34 @@ void destroy_context(struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
-#ifdef CONFIG_PPC_ICSWX
-	drop_cop(mm->context.acop, mm);
-	kfree(mm->context.cop_lockp);
-	mm->context.cop_lockp = NULL;
-#endif /* CONFIG_PPC_ICSWX */
+	if (radix_enabled())
+		WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+	else
+		subpage_prot_free(mm);
+	destroy_pagetable_page(mm);
+	__destroy_context(mm->context.id);
+	mm->context.id = MMU_NO_CONTEXT;
+}
 
+void arch_exit_mmap(struct mm_struct *mm)
+{
 	if (radix_enabled()) {
 		/*
 		 * Radix doesn't have a valid bit in the process table
 		 * entries. However we know that at least P9 implementation
 		 * will avoid caching an entry with an invalid RTS field,
 		 * and 0 is invalid. So this will do.
+		 *
+		 * This runs before the "fullmm" tlb flush in exit_mmap,
+		 * which does a RIC=2 tlbie to clear the process table
+		 * entry. See the "fullmm" comments in tlb-radix.c.
+		 *
+		 * No barrier required here after the store because
+		 * this process will do the invalidate, which starts with
+		 * ptesync.
 		 */
 		process_tb[mm->context.id].prtb0 = 0;
-	} else
-		subpage_prot_free(mm);
-	destroy_pagetable_page(mm);
-	__destroy_context(mm->context.id);
-	mm->context.id = MMU_NO_CONTEXT;
+	}
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index d46128b22150..57fbc554c785 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -27,7 +27,7 @@
 /*
  * On 40x and 8xx, we directly inline tlbia and tlbivax
  */
-#if defined(CONFIG_40x) || defined(CONFIG_8xx)
+#if defined(CONFIG_40x) || defined(CONFIG_PPC_8xx)
 static inline void _tlbil_all(void)
 {
 	asm volatile ("sync; tlbia; isync" : : : "memory");
@@ -38,7 +38,7 @@ static inline void _tlbil_pid(unsigned int pid)
 }
 #define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
 
-#else /* CONFIG_40x || CONFIG_8xx */
+#else /* CONFIG_40x || CONFIG_PPC_8xx */
 extern void _tlbil_all(void);
 extern void _tlbil_pid(unsigned int pid);
 #ifdef CONFIG_PPC_BOOK3E
@@ -46,12 +46,12 @@ extern void _tlbil_pid_noind(unsigned int pid);
 #else
 #define _tlbil_pid_noind(pid)	_tlbil_pid(pid)
 #endif
-#endif /* !(CONFIG_40x || CONFIG_8xx) */
+#endif /* !(CONFIG_40x || CONFIG_PPC_8xx) */
 
 /*
  * On 8xx, we directly inline tlbie, on others, it's extern
  */
-#ifdef CONFIG_8xx
+#ifdef CONFIG_PPC_8xx
 static inline void _tlbil_va(unsigned long address, unsigned int pid,
 			     unsigned int tsize, unsigned int ind)
 {
@@ -67,7 +67,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid,
 {
 	__tlbil_va(address, pid);
 }
-#endif /* CONFIG_8xx */
+#endif /* CONFIG_PPC_8xx */
 
 #if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x)
 extern void _tlbivax_bcast(unsigned long address, unsigned int pid,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b95c584ce19d..adb6364f4091 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1148,11 +1148,33 @@ struct topology_update_data {
 	int new_nid;
 };
 
+#define TOPOLOGY_DEF_TIMER_SECS	60
+
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
 static int prrn_enabled;
 static void reset_topology_timer(void);
+static int topology_timer_secs = 1;
+static int topology_inited;
+static int topology_update_needed;
+
+/*
+ * Change polling interval for associativity changes.
+ */
+int timed_topology_update(int nsecs)
+{
+	if (vphn_enabled) {
+		if (nsecs > 0)
+			topology_timer_secs = nsecs;
+		else
+			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
+
+		reset_topology_timer();
+	}
+
+	return 0;
+}
 
 /*
  * Store the current values of the associativity change counters in the
@@ -1246,6 +1268,11 @@ static long vphn_get_associativity(unsigned long cpu,
 			"hcall_vphn() experienced a hardware fault "
 			"preventing VPHN. Disabling polling...\n");
 		stop_topology_update();
+		break;
+	case H_SUCCESS:
+		dbg("VPHN hcall succeeded. Reset polling...\n");
+		timed_topology_update(0);
+		break;
 	}
 
 	return rc;
@@ -1323,8 +1350,11 @@ int numa_update_cpu_topology(bool cpus_locked)
 	struct device *dev;
 	int weight, new_nid, i = 0;
 
-	if (!prrn_enabled && !vphn_enabled)
+	if (!prrn_enabled && !vphn_enabled) {
+		if (!topology_inited)
+			topology_update_needed = 1;
 		return 0;
+	}
 
 	weight = cpumask_weight(&cpu_associativity_changes_mask);
 	if (!weight)
@@ -1363,22 +1393,30 @@ int numa_update_cpu_topology(bool cpus_locked)
 			cpumask_andnot(&cpu_associativity_changes_mask,
 					&cpu_associativity_changes_mask,
 					cpu_sibling_mask(cpu));
+			dbg("Assoc chg gives same node %d for cpu%d\n",
+					new_nid, cpu);
 			cpu = cpu_last_thread_sibling(cpu);
 			continue;
 		}
 
 		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
 			ud = &updates[i++];
+			ud->next = &updates[i];
 			ud->cpu = sibling;
 			ud->new_nid = new_nid;
 			ud->old_nid = numa_cpu_lookup_table[sibling];
 			cpumask_set_cpu(sibling, &updated_cpus);
-			if (i < weight)
-				ud->next = &updates[i];
 		}
 		cpu = cpu_last_thread_sibling(cpu);
 	}
 
+	/*
+	 * Prevent processing of 'updates' from overflowing array
+	 * where last entry filled in a 'next' pointer.
+	 */
+	if (i)
+		updates[i-1].next = NULL;
+
 	pr_debug("Topology update for the following CPUs:\n");
 	if (cpumask_weight(&updated_cpus)) {
 		for (ud = &updates[0]; ud; ud = ud->next) {
@@ -1433,12 +1471,12 @@ int numa_update_cpu_topology(bool cpus_locked)
 
 out:
 	kfree(updates);
+	topology_update_needed = 0;
 	return changed;
 }
 
 int arch_update_cpu_topology(void)
 {
-	lockdep_assert_cpus_held();
 	return numa_update_cpu_topology(true);
 }
 
@@ -1453,7 +1491,7 @@ static void topology_schedule_update(void)
 	schedule_work(&topology_work);
 }
 
-static void topology_timer_fn(unsigned long ignored)
+static void topology_timer_fn(struct timer_list *unused)
 {
 	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
 		topology_schedule_update();
@@ -1463,14 +1501,11 @@ static void topology_timer_fn(unsigned long ignored)
 		reset_topology_timer();
 	}
 }
-static struct timer_list topology_timer =
-	TIMER_INITIALIZER(topology_timer_fn, 0, 0);
+static struct timer_list topology_timer;
 
 static void reset_topology_timer(void)
 {
-	topology_timer.data = 0;
-	topology_timer.expires = jiffies + 60 * HZ;
-	mod_timer(&topology_timer, topology_timer.expires);
+	mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
 }
 
 #ifdef CONFIG_SMP
@@ -1519,18 +1554,18 @@ int start_topology_update(void)
 	if (firmware_has_feature(FW_FEATURE_PRRN)) {
 		if (!prrn_enabled) {
 			prrn_enabled = 1;
-			vphn_enabled = 0;
 #ifdef CONFIG_SMP
 			rc = of_reconfig_notifier_register(&dt_update_nb);
 #endif
 		}
-	} else if (firmware_has_feature(FW_FEATURE_VPHN) &&
+	}
+	if (firmware_has_feature(FW_FEATURE_VPHN) &&
 		   lppaca_shared_proc(get_lppaca())) {
 		if (!vphn_enabled) {
-			prrn_enabled = 0;
 			vphn_enabled = 1;
 			setup_cpu_associativity_change_counters();
-			init_timer_deferrable(&topology_timer);
+			timer_setup(&topology_timer, topology_timer_fn,
+				    TIMER_DEFERRABLE);
 			reset_topology_timer();
 		}
 	}
@@ -1550,7 +1585,8 @@ int stop_topology_update(void)
 #ifdef CONFIG_SMP
 		rc = of_reconfig_notifier_unregister(&dt_update_nb);
 #endif
-	} else if (vphn_enabled) {
+	}
+	if (vphn_enabled) {
 		vphn_enabled = 0;
 		rc = del_timer_sync(&topology_timer);
 	}
@@ -1613,9 +1649,17 @@ static int topology_update_init(void)
 	if (topology_updates_enabled)
 		start_topology_update();
 
+	if (vphn_enabled)
+		topology_schedule_update();
+
 	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
 		return -ENOMEM;
 
+	topology_inited = 1;
+	if (topology_update_needed)
+		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+					nr_cpumask_bits);
+
 	return 0;
 }
 device_initcall(topology_update_init);
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 31eed8fa8e99..3b65917785a5 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -9,6 +9,7 @@
 
 #include <linux/sched.h>
 #include <linux/mm_types.h>
+#include <misc/cxl-base.h>
 
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -64,6 +65,27 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	trace_hugepage_set_pmd(addr, pmd_val(pmd));
 	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
+
+static void do_nothing(void *unused)
+{
+
+}
+/*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+	smp_mb();
+	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
 /*
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
@@ -77,7 +99,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 	 * This ensures that generic code that rely on IRQ disabling
 	 * to prevent a parallel THP split work as expected.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(vma->vm_mm);
 }
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 443a2c66a304..ec277913e01b 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -239,7 +239,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addres
 	 * by sending an IPI to all the cpus and executing a dummy
 	 * function there.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(vma->vm_mm);
 	/*
 	 * Now invalidate the hpte entries in the range
 	 * covered by pmd. This make sure we take a
@@ -329,7 +329,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 	unsigned int psize;
 	unsigned long vsid;
 	unsigned long flags = 0;
-	const struct cpumask *tmp;
 
 	/* get the base page size,vsid and segment size */
 #ifdef CONFIG_DEBUG_VM
@@ -350,8 +349,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 		ssize = mmu_kernel_ssize;
 	}
 
-	tmp = cpumask_of(smp_processor_id());
-	if (cpumask_equal(mm_cpumask(mm), tmp))
+	if (mm_is_thread_local(mm))
 		flags |= HPTE_LOCAL_UPDATE;
 
 	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
@@ -380,16 +378,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 	 */
 	memset(pgtable, 0, PTE_FRAG_SIZE);
 	/*
-	 * Serialize against find_linux_pte_or_hugepte which does lock-less
+	 * Serialize against find_current_mm_pte variants which does lock-less
 	 * lookup in page tables with local interrupts disabled. For huge pages
 	 * it casts pmd_t to pte_t. Since format of pte_t is different from
 	 * pmd_t we want to prevent transit from pmd pointing to page table
 	 * to pmd pointing to huge page (and back) while interrupts are disabled.
 	 * We clear pmd to possibly replace it with page table pointer in
 	 * different code paths. So make sure we wait for the parallel
-	 * find_linux_pte_or_hugepage to finish.
+	 * find_curren_mm_pte to finish.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(mm);
 	return old_pmd;
 }
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 5cc50d47ce3f..cfbbee941a76 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -8,10 +8,15 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include <linux/kernel.h>
 #include <linux/sched/mm.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/mm.h>
+#include <linux/string_helpers.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -25,12 +30,19 @@
 
 #include <trace/events/thp.h>
 
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
 static int native_register_process_table(unsigned long base, unsigned long pg_sz,
 					 unsigned long table_size)
 {
-	unsigned long patb1 = base | table_size | PATB_GR;
+	unsigned long patb0, patb1;
+
+	patb0 = be64_to_cpu(partition_tb[0].patb0);
+	patb1 = base | table_size | PATB_GR;
+
+	mmu_partition_table_set_entry(0, patb0, patb1);
 
-	partition_tb->patb1 = cpu_to_be64(patb1);
 	return 0;
 }
 
@@ -157,6 +169,16 @@ void radix__mark_rodata_ro(void)
 {
 	unsigned long start, end;
 
+	/*
+	 * mark_rodata_ro() will mark itself as !writable at some point.
+	 * Due to DD1 workaround in radix__pte_update(), we'll end up with
+	 * an invalid pte and the system will crash quite severly.
+	 */
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
+		return;
+	}
+
 	start = (unsigned long)_stext;
 	end = (unsigned long)__init_begin;
 
@@ -176,10 +198,14 @@ static inline void __meminit print_mapping(unsigned long start,
 					   unsigned long end,
 					   unsigned long size)
 {
+	char buf[10];
+
 	if (end <= start)
 		return;
 
-	pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size);
+	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
+
+	pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
 }
 
 static int __meminit create_physical_mapping(unsigned long start,
@@ -261,11 +287,34 @@ static void __init radix_init_pgtable(void)
 	for_each_memblock(memory, reg)
 		WARN_ON(create_physical_mapping(reg->base,
 						reg->base + reg->size));
+
+	/* Find out how many PID bits are supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (!mmu_pid_bits)
+			mmu_pid_bits = 20;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		/*
+		 * When KVM is possible, we only use the top half of the
+		 * PID space to avoid collisions between host and guest PIDs
+		 * which can cause problems due to prefetch when exiting the
+		 * guest with AIL=3
+		 */
+		mmu_base_pid = 1 << (mmu_pid_bits - 1);
+#else
+		mmu_base_pid = 1;
+#endif
+	} else {
+		/* The guest uses the bottom half of the PID space */
+		if (!mmu_pid_bits)
+			mmu_pid_bits = 19;
+		mmu_base_pid = 1;
+	}
+
 	/*
 	 * Allocate Partition table and process table for the
 	 * host.
 	 */
-	BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 36), "Process table size too large.");
+	BUG_ON(PRTB_SIZE_SHIFT > 36);
 	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
 	/*
 	 * Fill in the process table.
@@ -339,6 +388,12 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
+	/* Find MMU PID size */
+	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+	if (prop && size == 4)
+		mmu_pid_bits = be32_to_cpup(prop);
+
+	/* Grab page size encodings */
 	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
 	if (!prop)
 		return 0;
@@ -494,6 +549,7 @@ void __init radix__early_init_mmu(void)
 	__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
 	__vmalloc_start = RADIX_VMALLOC_START;
 	__vmalloc_end = RADIX_VMALLOC_END;
+	__kernel_io_start = RADIX_KERN_IO_START;
 	vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
 	ioremap_bot = IOREMAP_BASE;
 
@@ -804,9 +860,12 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
 	 */
 	pmd = *pmdp;
 	pmd_clear(pmdp);
+
 	/*FIXME!!  Verify whether we need this kick below */
-	kick_all_cpus_sync();
-	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	serialize_against_pte_lookup(vma->vm_mm);
+
+	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
 	return pmd;
 }
 
@@ -865,16 +924,16 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
 	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 	old_pmd = __pmd(old);
 	/*
-	 * Serialize against find_linux_pte_or_hugepte which does lock-less
+	 * Serialize against find_current_mm_pte which does lock-less
 	 * lookup in page tables with local interrupts disabled. For huge pages
 	 * it casts pmd_t to pte_t. Since format of pte_t is different from
 	 * pmd_t we want to prevent transit from pmd pointing to page table
 	 * to pmd pointing to huge page (and back) while interrupts are disabled.
 	 * We clear pmd to possibly replace it with page table pointer in
 	 * different code paths. So make sure we wait for the parallel
-	 * find_linux_pte_or_hugepage to finish.
+	 * find_current_mm_pte to finish.
 	 */
-	kick_all_cpus_sync();
+	serialize_against_pte_lookup(mm);
 	return old_pmd;
 }
 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index a9e4bfc025bc..f6c7f54c0515 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -34,6 +34,7 @@
 #include <asm/fixmap.h>
 #include <asm/io.h>
 #include <asm/setup.h>
+#include <asm/sections.h>
 
 #include "mmu_decl.h"
 
@@ -242,7 +243,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags)
 /*
  * Map in a chunk of physical memory starting at start.
  */
-void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
+static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
 {
 	unsigned long v, s, f;
 	phys_addr_t p;
@@ -294,7 +295,7 @@ void __init mapin_ram(void)
  * Returns true (1) if PTE was found, zero otherwise.  The pointer to
  * the PTE pointer is unmodified if PTE is not found.
  */
-int
+static int
 get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
 {
         pgd_t	*pgd;
@@ -323,9 +324,7 @@ get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
         return(retval);
 }
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-
-static int __change_page_attr(struct page *page, pgprot_t prot)
+static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
 {
 	pte_t *kpte;
 	pmd_t *kpmd;
@@ -339,8 +338,6 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
 	if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
 		return -EINVAL;
 	__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
-	wmb();
-	flush_tlb_page(NULL, address);
 	pte_unmap(kpte);
 
 	return 0;
@@ -349,44 +346,65 @@ static int __change_page_attr(struct page *page, pgprot_t prot)
 /*
  * Change the page attributes of an page in the linear mapping.
  *
- * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY
+ * THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY
  */
 static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
 {
 	int i, err = 0;
 	unsigned long flags;
+	struct page *start = page;
 
 	local_irq_save(flags);
 	for (i = 0; i < numpages; i++, page++) {
-		err = __change_page_attr(page, prot);
+		err = __change_page_attr_noflush(page, prot);
 		if (err)
 			break;
 	}
+	wmb();
 	local_irq_restore(flags);
+	flush_tlb_kernel_range((unsigned long)page_address(start),
+			       (unsigned long)page_address(page));
 	return err;
 }
 
-
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void mark_initmem_nx(void)
 {
-	if (PageHighMem(page))
-		return;
+	struct page *page = virt_to_page(_sinittext);
+	unsigned long numpages = PFN_UP((unsigned long)_einittext) -
+				 PFN_DOWN((unsigned long)_sinittext);
 
-	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+	change_page_attr(page, numpages, PAGE_KERNEL);
 }
-#endif /* CONFIG_DEBUG_PAGEALLOC */
 
-static int fixmaps;
-
-void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mark_rodata_ro(void)
 {
-	unsigned long address = __fix_to_virt(idx);
+	struct page *page;
+	unsigned long numpages;
+
+	page = virt_to_page(_stext);
+	numpages = PFN_UP((unsigned long)_etext) -
+		   PFN_DOWN((unsigned long)_stext);
 
-	if (idx >= __end_of_fixed_addresses) {
-		BUG();
+	change_page_attr(page, numpages, PAGE_KERNEL_ROX);
+	/*
+	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
+	 * to cover NOTES and EXCEPTION_TABLE.
+	 */
+	page = virt_to_page(__start_rodata);
+	numpages = PFN_UP((unsigned long)__init_begin) -
+		   PFN_DOWN((unsigned long)__start_rodata);
+
+	change_page_attr(page, numpages, PAGE_KERNEL_RO);
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (PageHighMem(page))
 		return;
-	}
 
-	map_kernel_page(address, phys, pgprot_val(flags));
-	fixmaps++;
+	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
 }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 0736e94c7615..813ea22c3e00 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -57,7 +57,7 @@
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
 #error TASK_SIZE_USER64 exceeds user VSID range
 #endif
@@ -104,6 +104,8 @@ unsigned long __vmalloc_start;
 EXPORT_SYMBOL(__vmalloc_start);
 unsigned long __vmalloc_end;
 EXPORT_SYMBOL(__vmalloc_end);
+unsigned long __kernel_io_start;
+EXPORT_SYMBOL(__kernel_io_start);
 struct page *vmemmap;
 EXPORT_SYMBOL(vmemmap);
 unsigned long __pte_frag_nr;
@@ -402,7 +404,7 @@ void pte_fragment_free(unsigned long *table, int kernel)
 	if (put_page_testzero(page)) {
 		if (!kernel)
 			pgtable_page_dtor(page);
-		free_hot_cold_page(page, 0);
+		free_unref_page(page);
 	}
 }
 
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index bde378559d01..2cf5ef3fc50d 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -121,12 +121,25 @@ slb_miss_kernel_load_vmemmap:
 1:
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-	/* vmalloc mapping gets the encoding from the PACA as the mapping
-	 * can be demoted from 64K -> 4K dynamically on some machines
+	/*
+	 * r10 contains the ESID, which is the original faulting EA shifted
+	 * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28)
+	 * which is 0xd00038000. That can't be used as an immediate, even if we
+	 * ignored the 0xd, so we have to load it into a register, and we only
+	 * have one register free. So we must load all of (H_VMALLOC_END >> 28)
+	 * into a register and compare ESID against that.
+	 */
+	lis	r11,(H_VMALLOC_END >> 32)@h	// r11 = 0xffffffffd0000000
+	ori	r11,r11,(H_VMALLOC_END >> 32)@l	// r11 = 0xffffffffd0003800
+	// Rotate left 4, then mask with 0xffffffff0
+	rldic	r11,r11,4,28			// r11 = 0xd00038000
+	cmpld	r10,r11				// if r10 >= r11
+	bge	5f				//   goto io_mapping
+
+	/*
+	 * vmalloc mapping gets the encoding from the PACA as the mapping
+	 * can be demoted from 64K -> 4K dynamically on some machines.
 	 */
-	clrldi	r11,r10,48
-	cmpldi	r11,(H_VMALLOC_SIZE >> 28) - 1
-	bgt	5f
 	lhz	r11,PACAVMALLOCSLLP(r13)
 	b	6f
 5:
@@ -154,7 +167,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
         /*
          * user space make sure we are within the allowed limit
 	 */
-	ld	r11,PACA_ADDR_LIMIT(r13)
+	ld	r11,PACA_SLB_ADDR_LIMIT(r13)
 	cmpld	r3,r11
 	bge-	8f
 
@@ -296,10 +309,6 @@ slb_compare_rr_to_size:
 	srdi	r10,r10,(SID_SHIFT_1T - SID_SHIFT)	/* get 1T ESID */
 	rldimi  r10,r9,ESID_BITS_1T,0
 	ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
-	/*
-	 * bits above VSID_BITS_1T need to be ignored from r10
-	 * also combine VSID and flags
-	 */
 
 	li	r10,MMU_SEGSIZE_1T
 	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 45f6740dd407..23ec2c5e3b78 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *vma;
 
-	if ((mm->task_size - len) < addr)
+	if ((mm->context.slb_addr_limit - len) < addr)
 		return 0;
 	vma = find_vma(mm, addr);
 	return (!vma || (addr + len) <= vm_start_gap(vma));
@@ -122,7 +122,8 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
 	return !slice_area_is_free(mm, start, end - start);
 }
 
-static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
+				unsigned long high_limit)
 {
 	unsigned long i;
 
@@ -133,15 +134,16 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 		if (!slice_low_has_vma(mm, i))
 			ret->low_slices |= 1u << i;
 
-	if (mm->task_size <= SLICE_LOW_TOP)
+	if (high_limit <= SLICE_LOW_TOP)
 		return;
 
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++)
 		if (!slice_high_has_vma(mm, i))
 			__set_bit(i, ret->high_slices);
 }
 
-static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret)
+static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret,
+				unsigned long high_limit)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
@@ -156,8 +158,11 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma
 		if (((lpsizes >> (i * 4)) & 0xf) == psize)
 			ret->low_slices |= 1u << i;
 
+	if (high_limit <= SLICE_LOW_TOP)
+		return;
+
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
@@ -169,7 +174,11 @@ static int slice_check_fit(struct mm_struct *mm,
 			   struct slice_mask mask, struct slice_mask available)
 {
 	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
+	/*
+	 * Make sure we just do bit compare only to the max
+	 * addr limit and not the full bit map size.
+	 */
+	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
 
 	bitmap_and(result, mask.high_slices,
 		   available.high_slices, slice_count);
@@ -219,7 +228,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (test_bit(i, mask.high_slices))
@@ -329,8 +338,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	 * Only for that request for which high_limit is above
 	 * DEFAULT_MAP_WINDOW we should apply this.
 	 */
-	if (high_limit  > DEFAULT_MAP_WINDOW)
-		addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+	if (high_limit > DEFAULT_MAP_WINDOW)
+		addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW;
 
 	while (addr > PAGE_SIZE) {
 		info.high_limit = addr;
@@ -412,25 +421,31 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	struct slice_mask compat_mask;
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long page_size = 1UL << pshift;
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
 	unsigned long high_limit;
 
-	/*
-	 * Check if we need to expland slice area.
-	 */
-	if (unlikely(addr > mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE)) {
-		mm->context.addr_limit = TASK_SIZE;
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
+
+	if (len > high_limit)
+		return -ENOMEM;
+	if (len & (page_size - 1))
+		return -EINVAL;
+	if (fixed) {
+		if (addr & (page_size - 1))
+			return -EINVAL;
+		if (addr > high_limit - len)
+			return -ENOMEM;
+	}
+
+	if (high_limit > mm->context.slb_addr_limit) {
+		mm->context.slb_addr_limit = high_limit;
 		on_each_cpu(slice_flush_segments, mm, 1);
 	}
-	/*
-	 * This mmap request can allocate upt to 512TB
-	 */
-	if (addr > DEFAULT_MAP_WINDOW)
-		high_limit = mm->context.addr_limit;
-	else
-		high_limit = DEFAULT_MAP_WINDOW;
+
 	/*
 	 * init different masks
 	 */
@@ -446,27 +461,19 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
+	BUG_ON(mm->context.slb_addr_limit == 0);
 	VM_BUG_ON(radix_enabled());
 
 	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
 	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
 		  addr, len, flags, topdown);
 
-	if (len > mm->task_size)
-		return -ENOMEM;
-	if (len & ((1ul << pshift) - 1))
-		return -EINVAL;
-	if (fixed && (addr & ((1ul << pshift) - 1)))
-		return -EINVAL;
-	if (fixed && addr > (mm->task_size - len))
-		return -ENOMEM;
-
 	/* If hint, make sure it matches our alignment restrictions */
 	if (!fixed && addr) {
-		addr = _ALIGN_UP(addr, 1ul << pshift);
+		addr = _ALIGN_UP(addr, page_size);
 		slice_dbg(" aligned addr=%lx\n", addr);
 		/* Ignore hint if it's too large or overlaps a VMA */
-		if (addr > mm->task_size - len ||
+		if (addr > high_limit - len ||
 		    !slice_area_is_free(mm, addr, len))
 			addr = 0;
 	}
@@ -474,7 +481,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
-	slice_mask_for_size(mm, psize, &good_mask);
+	slice_mask_for_size(mm, psize, &good_mask, high_limit);
 	slice_print_mask(" good_mask", good_mask);
 
 	/*
@@ -499,7 +506,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If we support combo pages, we can allow 64k pages in 4k slices */
 	if (psize == MMU_PAGE_64K) {
-		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit);
 		if (fixed)
 			slice_or_mask(&good_mask, &compat_mask);
 	}
@@ -532,11 +539,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 			return newaddr;
 		}
 	}
-
-	/* We don't fit in the good mask, check what other slices are
+	/*
+	 * We don't fit in the good mask, check what other slices are
 	 * empty and thus can be converted
 	 */
-	slice_mask_for_free(mm, &potential_mask);
+	slice_mask_for_free(mm, &potential_mask, high_limit);
 	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", potential_mask);
 
@@ -746,17 +753,18 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 {
 	struct slice_mask mask, available;
 	unsigned int psize = mm->context.user_psize;
+	unsigned long high_limit = mm->context.slb_addr_limit;
 
 	if (radix_enabled())
 		return 0;
 
 	slice_range_to_mask(addr, len, &mask);
-	slice_mask_for_size(mm, psize, &available);
+	slice_mask_for_size(mm, psize, &available, high_limit);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
 	if (psize == MMU_PAGE_64K) {
 		struct slice_mask compat_mask;
-		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit);
 		slice_or_mask(&available, &compat_mask);
 	}
 #endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index e94fbd4c8845..781532d7bc4d 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -36,7 +36,7 @@ void subpage_prot_free(struct mm_struct *mm)
 		}
 	}
 	addr = 0;
-	for (i = 0; i < 2; ++i) {
+	for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
 		p = spt->protptrs[i];
 		if (!p)
 			continue;
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 744e0164ecf5..884f4b705b57 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -12,12 +12,12 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
-#include <asm/ppc-opcode.h>
 
+#include <asm/ppc-opcode.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/trace.h>
-
+#include <asm/cputhreads.h>
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
@@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = PPC_BIT(53); /* IS = 1 */
+	rs = pid << PPC_BITLSHIFT(31);
+	prs = 1; /* process scoped */
+	r = 1;   /* raidx format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
 /*
  * We use 128 set in radix mode and 256 set in hpt mode.
  */
@@ -54,63 +68,76 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 	 */
 	__tlbiel_pid(pid, 0, ric);
 
-	if (ric == RIC_FLUSH_ALL)
-		/* For the remaining sets, just flush the TLB */
-		ric = RIC_FLUSH_TLB;
+	/* For PWC, only one flush is needed */
+	if (ric == RIC_FLUSH_PWC) {
+		asm volatile("ptesync": : :"memory");
+		return;
+	}
 
+	/* For the remaining sets, just flush the TLB */
 	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
-		__tlbiel_pid(pid, set, ric);
+		__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
 
 	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
 
-static inline void tlbiel_pwc(unsigned long pid)
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 {
 	asm volatile("ptesync": : :"memory");
-
-	/* For PWC flush, we don't look at set number */
-	__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
-
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+	__tlbie_pid(pid, ric);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
-static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
+			       unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
-	rb = PPC_BIT(53); /* IS = 1 */
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
 	rs = pid << PPC_BITLSHIFT(31);
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+	trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
 }
 
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
-			      unsigned long ap, unsigned long ric)
+			      unsigned long psize, unsigned long ric)
 {
-	unsigned long rb,rs,prs,r;
+	unsigned long ap = mmu_get_ap(psize);
 
-	rb = va & ~(PPC_BITMASK(52, 63));
-	rb |= ap << PPC_BITLSHIFT(58);
-	rs = pid << PPC_BITLSHIFT(31);
-	prs = 1; /* process scoped */
-	r = 1;   /* raidx format */
+	asm volatile("ptesync": : :"memory");
+	__tlbiel_va(va, pid, ap, ric);
+	asm volatile("ptesync": : :"memory");
+}
 
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize, bool also_pwc)
+{
 	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
-		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	if (also_pwc)
+		__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+	__tlbiel_va_range(start, end, pid, page_size, psize);
 	asm volatile("ptesync": : :"memory");
-	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
 			     unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -121,13 +148,43 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+			      unsigned long psize, unsigned long ric)
+{
+	unsigned long ap = mmu_get_ap(psize);
+
+	asm volatile("ptesync": : :"memory");
+	__tlbie_va(va, pid, ap, ric);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize, bool also_pwc)
+{
+	asm volatile("ptesync": : :"memory");
+	if (also_pwc)
+		__tlbie_pid(pid, RIC_FLUSH_PWC);
+	__tlbie_va_range(start, end, pid, page_size, psize);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
 /*
  * Base TLB flushing operations:
  *
@@ -146,42 +203,34 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm)
 	preempt_disable();
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_pid(pid, RIC_FLUSH_ALL);
+		_tlbiel_pid(pid, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_mm);
 
-void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+#ifndef CONFIG_SMP
+void radix__local_flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
-	struct mm_struct *mm = tlb->mm;
-	/*
-	 * If we are doing a full mm flush, we will do a tlb flush
-	 * with RIC_FLUSH_ALL later.
-	 */
-	if (tlb->fullmm)
-		return;
 
 	preempt_disable();
-
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		tlbiel_pwc(pid);
-
+		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 	preempt_enable();
 }
-EXPORT_SYMBOL(radix__local_flush_tlb_pwc);
+EXPORT_SYMBOL(radix__local_flush_all_mm);
+#endif /* CONFIG_SMP */
 
 void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 				       int psize)
 {
 	unsigned long pid;
-	unsigned long ap = mmu_get_ap(psize);
 
 	preempt_disable();
-	pid = mm ? mm->context.id : 0;
+	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 
@@ -189,11 +238,10 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
 {
 #ifdef CONFIG_HUGETLB_PAGE
 	/* need the return fix for nohash.c */
-	if (vma && is_vm_hugetlb_page(vma))
-		return __local_flush_hugetlb_page(vma, vmaddr);
+	if (is_vm_hugetlb_page(vma))
+		return radix__local_flush_hugetlb_page(vma, vmaddr);
 #endif
-	radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
-					  mmu_virtual_psize);
+	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_page);
 
@@ -202,75 +250,71 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
-		_tlbie_pid(pid, RIC_FLUSH_ALL);
+		_tlbie_pid(pid, RIC_FLUSH_TLB);
 	else
-		_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
+		_tlbiel_pid(pid, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
 
-void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+void radix__flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
-	struct mm_struct *mm = tlb->mm;
-
-	/*
-	 * If we are doing a full mm flush, we will do a tlb flush
-	 * with RIC_FLUSH_ALL later.
-	 */
-	if (tlb->fullmm)
-		return;
-	preempt_disable();
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
-		_tlbie_pid(pid, RIC_FLUSH_PWC);
+		_tlbie_pid(pid, RIC_FLUSH_ALL);
 	else
-		tlbiel_pwc(pid);
-no_context:
+		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 	preempt_enable();
 }
+EXPORT_SYMBOL(radix__flush_all_mm);
+
+void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+{
+	tlb->need_flush_all = 1;
+}
 EXPORT_SYMBOL(radix__flush_tlb_pwc);
 
 void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 				 int psize)
 {
 	unsigned long pid;
-	unsigned long ap = mmu_get_ap(psize);
 
-	preempt_disable();
-	pid = mm ? mm->context.id : 0;
+	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto bail;
+		return;
+
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
-		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	else
-		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
+		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 
 void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
 #ifdef CONFIG_HUGETLB_PAGE
-	if (vma && is_vm_hugetlb_page(vma))
-		return flush_hugetlb_page(vma, vmaddr);
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_page(vma, vmaddr);
 #endif
-	radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
-				    mmu_virtual_psize);
+	radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
 }
 EXPORT_SYMBOL(radix__flush_tlb_page);
 
+#else /* CONFIG_SMP */
+#define radix__flush_all_mm radix__local_flush_all_mm
 #endif /* CONFIG_SMP */
 
 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -279,16 +323,86 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
+#define TLB_FLUSH_ALL -1UL
+
 /*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
  */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
 
 {
 	struct mm_struct *mm = vma->vm_mm;
-	radix__flush_tlb_mm(mm);
+	unsigned long pid;
+	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool local, full;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	preempt_disable();
+	if (mm_is_thread_local(mm)) {
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
+	} else {
+		local = false;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_single_page_flush_ceiling);
+	}
+
+	if (full) {
+		if (local)
+			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+		else
+			_tlbie_pid(pid, RIC_FLUSH_TLB);
+	} else {
+		bool hflush = false;
+		unsigned long hstart, hend;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
+		hend = end >> HPAGE_PMD_SHIFT;
+		if (hstart < hend) {
+			hstart <<= HPAGE_PMD_SHIFT;
+			hend <<= HPAGE_PMD_SHIFT;
+			hflush = true;
+		}
+#endif
+
+		asm volatile("ptesync": : :"memory");
+		if (local) {
+			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbiel_va_range(hstart, hend, pid,
+						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("ptesync": : :"memory");
+		} else {
+			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbie_va_range(hstart, hend, pid,
+						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		}
+	}
+	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
@@ -307,63 +421,122 @@ static int radix_get_mmu_psize(int page_size)
 	return psize;
 }
 
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize);
+
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
 	int psize = 0;
 	struct mm_struct *mm = tlb->mm;
 	int page_size = tlb->page_size;
 
-	psize = radix_get_mmu_psize(page_size);
 	/*
 	 * if page size is not something we understand, do a full mm flush
+	 *
+	 * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+	 * that flushes the process table entry cache upon process teardown.
+	 * See the comment for radix in arch_exit_mmap().
 	 */
-	if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
-		radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-	else
-		radix__flush_tlb_mm(mm);
-}
+	if (tlb->fullmm) {
+		radix__flush_all_mm(mm);
+	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+		if (!tlb->need_flush_all)
+			radix__flush_tlb_mm(mm);
+		else
+			radix__flush_all_mm(mm);
+	} else {
+		unsigned long start = tlb->start;
+		unsigned long end = tlb->end;
 
-#define TLB_FLUSH_ALL -1UL
-/*
- * Number of pages above which we will do a bcast tlbie. Just a
- * number at this point copied from x86
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+		if (!tlb->need_flush_all)
+			radix__flush_tlb_range_psize(mm, start, end, psize);
+		else
+			radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+	}
+	tlb->need_flush_all = 0;
+}
 
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
-				  unsigned long end, int psize)
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				int psize, bool also_pwc)
 {
 	unsigned long pid;
-	unsigned long addr;
-	int local = mm_is_thread_local(mm);
-	unsigned long ap = mmu_get_ap(psize);
-	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
+	unsigned int page_shift = mmu_psize_defs[psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool local, full;
 
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
 
 	preempt_disable();
-	pid = mm ? mm->context.id : 0;
-	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto err_out;
+	if (mm_is_thread_local(mm)) {
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
+	} else {
+		local = false;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_single_page_flush_ceiling);
+	}
 
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling * page_size) {
+	if (full) {
 		if (local)
-			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		else
-			_tlbie_pid(pid, RIC_FLUSH_TLB);
-		goto err_out;
-	}
-	for (addr = start; addr < end; addr += page_size) {
-
+			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
+	} else {
 		if (local)
-			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
 		else
-			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
 	}
-err_out:
 	preempt_enable();
 }
 
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long pid, end;
+
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	/* 4k page size, just blow the world */
+	if (PAGE_SIZE == 0x1000) {
+		radix__flush_all_mm(mm);
+		return;
+	}
+
+	end = addr + HPAGE_PMD_SIZE;
+
+	/* Otherwise first do the PWC, then iterate the pages. */
+	preempt_disable();
+
+	if (mm_is_thread_local(mm)) {
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+	} else {
+		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+	}
+
+	preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa,
 			      unsigned long page_size)
 {
@@ -454,3 +627,44 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
 	else
 		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
 }
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+	unsigned int pid = mm->context.id;
+
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	/*
+	 * If this context hasn't run on that CPU before and KVM is
+	 * around, there's a slim chance that the guest on another
+	 * CPU just brought in obsolete translation into the TLB of
+	 * this CPU due to a bad prefetch using the guest PID on
+	 * the way into the hypervisor.
+	 *
+	 * We work around this here. If KVM is possible, we check if
+	 * any sibling thread is in KVM. If it is, the window may exist
+	 * and thus we flush that PID from the core.
+	 *
+	 * A potential future improvement would be to mark which PIDs
+	 * have never been used on the system and avoid it if the PID
+	 * is new and the process has no other cpumask bit set.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+		int cpu = smp_processor_id();
+		int sib = cpu_first_thread_sibling(cpu);
+		bool flush = false;
+
+		for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+			if (sib == cpu)
+				continue;
+			if (paca[sib].kvm_hstate.kvm_vcpu)
+				flush = true;
+		}
+		if (flush)
+			_tlbiel_pid(pid, RIC_FLUSH_ALL);
+	}
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index b5b0fb97b9c0..881ebd53ffc2 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -29,6 +29,8 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/bug.h>
+#include <asm/pte-walk.h>
+
 
 #include <trace/events/thp.h>
 
@@ -138,13 +140,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
  */
 void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 {
-	const struct cpumask *tmp;
-	int i, local = 0;
+	int i, local;
 
 	i = batch->index;
-	tmp = cpumask_of(smp_processor_id());
-	if (cpumask_equal(mm_cpumask(batch->mm), tmp))
-		local = 1;
+	local = mm_is_thread_local(batch->mm);
 	if (i == 1)
 		flush_hash_page(batch->vpn[0], batch->pte[0],
 				batch->psize, batch->ssize, local);
@@ -207,8 +206,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp,
-							&hugepage_shift);
+		pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
+						  &hugepage_shift);
 		unsigned long pte;
 
 		if (ptep == NULL)
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index eabecfcaef7c..048b8e9f4492 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -60,7 +60,7 @@ _GLOBAL(__tlbil_va)
 	isync
 1:	blr
 
-#elif defined(CONFIG_8xx)
+#elif defined(CONFIG_PPC_8xx)
 
 /*
  * Nothing to do for 8xx, everything is inline
diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c
index 5f8ef50e5c66..f83044faac23 100644
--- a/arch/powerpc/mm/vphn.c
+++ b/arch/powerpc/mm/vphn.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <asm/byteorder.h>
 #include "vphn.h"
 
diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h
index fe8b7805b78f..f9ffdb3942fc 100644
--- a/arch/powerpc/mm/vphn.h
+++ b/arch/powerpc/mm/vphn.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _ARCH_POWERPC_MM_VPHN_H_
 #define _ARCH_POWERPC_MM_VPHN_H_
author	James Morris <james.l.morris@oracle.com>	2017-11-29 12:47:41 +1100
committer	James Morris <james.l.morris@oracle.com>	2017-11-29 12:47:41 +1100
commit	cf40a76e7d5874bb25f4404eecc58a2e033af885 (patch)
tree	8fd81cbea03c87b3d41d7ae5b1d11eadd35d6ef5 /arch/powerpc/mm
parent	ab5348c9c23cd253f5902980d2d8fe067dc24c82 (diff)
parent	4fbd8d194f06c8a3fd2af1ce560ddb31f7ec8323 (diff)
download	talos-op-linux-cf40a76e7d5874bb25f4404eecc58a2e033af885.tar.gz talos-op-linux-cf40a76e7d5874bb25f4404eecc58a2e033af885.zip