Diffstat (limited to 'arch/powerpc/mm')
30 files changed, 969 insertions, 497 deletions
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 949100577db5..6c5025e81236 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -13,62 +13,115 @@ */ #include <linux/memblock.h> +#include <asm/fixmap.h> +#include <asm/code-patching.h> #include "mmu_decl.h" +#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) + extern int __map_without_ltlbs; + /* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. + * Return PA for this VA if it is in IMMR area, or 0 */ -void __init MMU_init_hw(void) +phys_addr_t v_block_mapped(unsigned long va) { - /* Nothing to do for the time being but keep it similar to other PPC */ + unsigned long p = PHYS_IMMR_BASE; + + if (__map_without_ltlbs) + return 0; + if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) + return p + va - VIRT_IMMR_BASE; + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (__map_without_ltlbs) + return 0; + if (pa >= p && pa < p + IMMR_SIZE) + return VIRT_IMMR_BASE + pa - p; + return 0; } -#define LARGE_PAGE_SIZE_4M (1<<22) #define LARGE_PAGE_SIZE_8M (1<<23) -#define LARGE_PAGE_SIZE_64M (1<<26) -unsigned long __init mmu_mapin_ram(unsigned long top) +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) { - unsigned long v, s, mapped; - phys_addr_t p; + /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ +#ifdef CONFIG_PIN_TLB + unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY; +#ifdef CONFIG_PIN_TLB_IMMR + int i = 29; +#else + int i = 28; +#endif + unsigned long addr = 0; + unsigned long mem = total_lowmem; + + for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { + mtspr(SPRN_MD_CTR, ctr | (i << 8)); + mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); + mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); + addr += LARGE_PAGE_SIZE_8M; + mem -= LARGE_PAGE_SIZE_8M; + } +#endif +} - v = KERNELBASE; - p = 0; - s = top; +static void mmu_mapin_immr(void) +{ + unsigned long p = PHYS_IMMR_BASE; + unsigned long v = VIRT_IMMR_BASE; + unsigned long f = pgprot_val(PAGE_KERNEL_NCG); + int offset; - if (__map_without_ltlbs) - return 0; + for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) + map_page(v + offset, p + offset, f); +} -#ifdef CONFIG_PPC_4K_PAGES - while (s >= LARGE_PAGE_SIZE_8M) { - pmd_t *pmdp; - unsigned long val = p | MD_PS8MEG; +/* Address of instructions to patch */ +#ifndef CONFIG_PIN_TLB_IMMR +extern unsigned int DTLBMiss_jmp; +#endif +extern unsigned int DTLBMiss_cmp, FixupDAR_cmp; - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val + LARGE_PAGE_SIZE_4M); +void mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped) +{ + unsigned int instr = *addr; - v += LARGE_PAGE_SIZE_8M; - p += LARGE_PAGE_SIZE_8M; - s -= LARGE_PAGE_SIZE_8M; - } -#else /* CONFIG_PPC_16K_PAGES */ - while (s >= LARGE_PAGE_SIZE_64M) { - pmd_t *pmdp; - unsigned long val = p | MD_PS8MEG; + instr &= 0xffff0000; + instr |= (unsigned long)__va(mapped) >> 16; + patch_instruction(addr, instr); +} - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); - *pmdp++ = __pmd(val); +unsigned long __init mmu_mapin_ram(unsigned long top) +{ + unsigned long mapped; - v += LARGE_PAGE_SIZE_64M; - p += LARGE_PAGE_SIZE_64M; - s -= LARGE_PAGE_SIZE_64M; - } + if 
(__map_without_ltlbs) { + mapped = 0; + mmu_mapin_immr(); +#ifndef CONFIG_PIN_TLB_IMMR + patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP); #endif + } else { + mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); + } - mapped = top - s; + mmu_patch_cmp_limit(&DTLBMiss_cmp, mapped); + mmu_patch_cmp_limit(&FixupDAR_cmp, mapped); /* If the size of RAM is not an exact power of two, we may not * have covered RAM in its entirety with 8 MiB @@ -77,7 +130,8 @@ unsigned long __init mmu_mapin_ram(unsigned long top) * coverage with normal-sized pages (or other reasons) do not * attempt to allocate outside the allowed range. */ - memblock_set_current_limit(mapped); + if (mapped) + memblock_set_current_limit(mapped); return mapped; } @@ -90,13 +144,8 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, */ BUG_ON(first_memblock_base != 0); -#ifdef CONFIG_PIN_TLB /* 8xx can only access 24MB at the moment */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); -#else - /* 8xx can only access 8MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); -#endif } /* diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index f2cea6d5e764..1a4e570f7894 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,17 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ - init_$(CONFIG_WORD_SIZE).o \ - pgtable_$(CONFIG_WORD_SIZE).o + init_$(BITS).o pgtable_$(BITS).o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o -obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 6527882ce05e..bb0354222b11 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -75,7 +75,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); + *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a67c6d781c52..d0b137d96df1 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -26,7 +26,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/perf_event.h> @@ -205,7 +205,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
*/ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, +int do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { enum ctx_state prev_state = exception_enter(); @@ -429,7 +429,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, flags); + fault = handle_mm_fault(vma, address, flags); if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { if (fault & VM_FAULT_SIGSEGV) goto bad_area; @@ -498,8 +498,8 @@ bad_area_nosemaphore: bail: exception_exit(prev_state); return rc; - } +NOKPROBE_SYMBOL(do_page_fault); /* * bad_page_fault is called when we have a bad access from the kernel. diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 6333b273d2d5..42c702b3be1f 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -70,8 +70,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; - if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, - MMU_PAGE_4K, ssize, flags) == -1) + if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, + MMU_PAGE_4K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } @@ -84,21 +84,23 @@ repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); /* * Primary is full, try the secondary */ if (unlikely(slot == -1)) { hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_4K, + MMU_PAGE_4K, ssize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - ppc_md.hpte_remove(hpte_group); + mmu_hash_ops.hpte_remove(hpte_group); /* * FIXME!! Should be try the group from which we removed ? */ diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 16644e1f4e6b..3bbbea07378c 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -133,9 +133,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ret = ppc_md.hpte_updatepp(slot, rflags, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, flags); + ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, flags); /* *if we failed because typically the HPTE wasn't really here * we try an insertion. 
@@ -166,21 +166,22 @@ repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); /* * Primary is full, try the secondary */ if (unlikely(slot == -1)) { hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - ppc_md.hpte_remove(hpte_group); + mmu_hash_ops.hpte_remove(hpte_group); /* * FIXME!! Should be try the group from which we removed ? */ @@ -272,8 +273,9 @@ int __hash_page_64K(unsigned long ea, unsigned long access, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; - if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, - MMU_PAGE_64K, ssize, flags) == -1) + if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, + MMU_PAGE_64K, ssize, + flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } @@ -286,21 +288,24 @@ repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_64K, MMU_PAGE_64K, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_64K, MMU_PAGE_64K, + ssize); /* * Primary is full, try the secondary */ if (unlikely(slot == -1)) { hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - MMU_PAGE_64K, MMU_PAGE_64K, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_64K, + MMU_PAGE_64K, ssize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - ppc_md.hpte_remove(hpte_group); + mmu_hash_ops.hpte_remove(hpte_group); /* * FIXME!! Should be try the group from which we removed ? */ diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 115347f74ce5..09cc50c8dace 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -26,6 +26,7 @@ #include <asm/ppc_asm.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/export.h> #ifdef CONFIG_SMP .section .bss @@ -33,6 +34,7 @@ .globl mmu_hash_lock mmu_hash_lock: .space 4 +EXPORT_SYMBOL(mmu_hash_lock) #endif /* CONFIG_SMP */ /* @@ -575,6 +577,7 @@ _GLOBAL(flush_hash_pages) rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ stwcx. r8,0,r5 /* update the pte */ bne- 33b +EXPORT_SYMBOL(flush_hash_pages) /* Get the address of the primary PTE group in the hash table (r3) */ _GLOBAL(flush_hash_patch_A) diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index d873f6507f72..83ddc0e171b0 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -55,7 +55,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) * We need 14 to 65 bits of va for a tlibe of 4K page * With vpn we ignore the lower VPN_SHIFT bits already. 
* And top two bits are already ignored because we can - * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT + * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT * of 12. */ va = vpn << VPN_SHIFT; @@ -64,15 +64,15 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) * Older versions of the architecture (2.02 and earler) require the * masking of the top 16 bits. */ - va &= ~(0xffffULL << 48); + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: /* clear out bits after (52) [0....52.....63] */ va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | - ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + sllp = get_sllp_encoding(apsize); va |= sllp << 5; asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) @@ -113,15 +113,15 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) * Older versions of the architecture (2.02 and earler) require the * masking of the top 16 bits. */ - va &= ~(0xffffULL << 48); + if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) + va &= ~(0xffffULL << 48); switch (psize) { case MMU_PAGE_4K: /* clear out bits after(52) [0....52.....63] */ va &= ~((1ul << (64 - 52)) - 1); va |= ssize << 8; - sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | - ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + sllp = get_sllp_encoding(apsize); va |= sllp << 5; asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" : : "r"(va) : "memory"); @@ -316,8 +316,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, DBG_LOW(" -> hit\n"); /* Update the HPTE */ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N | + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N | HPTE_R_C))); } native_unlock_hpte(hptep); @@ -385,8 +385,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, /* Update the HPTE */ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N))); + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N))); /* * Ensure it is out of the tlb too. Bolted entries base and * actual page size will be same. @@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid, } #endif -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { - - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; - /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... 
- */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; - } - return -1; -} - static void hpte_decode(struct hash_pte *hpte, unsigned long slot, int *psize, int *apsize, int *ssize, unsigned long *vpn) { @@ -538,19 +508,15 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, size = MMU_PAGE_4K; a_size = MMU_PAGE_4K; } else { - for (size = 0; size < MMU_PAGE_COUNT; size++) { - - /* valid entries have a shift value */ - if (!mmu_psize_defs[size].shift) - continue; - - a_size = __hpte_actual_psize(lp, size); - if (a_size != -1) - break; - } + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; } /* This works for all page sizes, and for 256M and 1T segments */ - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + *ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT; + else + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + shift = mmu_psize_defs[size].shift; avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); @@ -601,7 +567,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * crashdump and all bets are off anyway. * * TODO: add batching support when enabled. remember, no dynamic memory here, - * athough there is the control page available... + * although there is the control page available... */ static void native_hpte_clear(void) { @@ -719,23 +685,29 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_restore(flags); } -static int native_update_partition_table(u64 patb1) +static int native_register_proc_table(unsigned long base, unsigned long page_size, + unsigned long table_size) { + unsigned long patb1 = base << 25; /* VSID */ + + patb1 |= (page_size << 5); /* sllp */ + patb1 |= table_size; + partition_tb->patb1 = cpu_to_be64(patb1); return 0; } void __init hpte_init_native(void) { - ppc_md.hpte_invalidate = native_hpte_invalidate; - ppc_md.hpte_updatepp = native_hpte_updatepp; - ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; - ppc_md.hpte_insert = native_hpte_insert; - ppc_md.hpte_remove = native_hpte_remove; - ppc_md.hpte_clear_all = native_hpte_clear; - ppc_md.flush_hash_range = native_flush_hash_range; - ppc_md.hugepage_invalidate = native_hugepage_invalidate; + mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_insert = native_hpte_insert; + mmu_hash_ops.hpte_remove = native_hpte_remove; + mmu_hash_ops.hpte_clear_all = native_hpte_clear; + mmu_hash_ops.flush_hash_range = native_flush_hash_range; + mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; if (cpu_has_feature(CPU_FTR_ARCH_300)) - ppc_md.update_partition_table = native_update_partition_table; + register_process_table = native_register_proc_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 59268969a0bc..44d3c3a38e3e 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -34,6 +34,7 @@ #include <linux/signal.h> #include <linux/memblock.h> #include <linux/context_tracking.h> +#include <linux/libfdt.h> #include <asm/processor.h> #include <asm/pgtable.h> @@ -58,6 +59,7 @@ #include <asm/firmware.h> #include <asm/tm.h> #include <asm/trace.h> +#include <asm/ps3.h> #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -87,14 +89,13 @@ * */ -#ifdef CONFIG_U3_DART -extern unsigned long dart_tablebase; -#endif /* CONFIG_U3_DART */ - static unsigned long _SDR1; struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; EXPORT_SYMBOL_GPL(mmu_psize_defs); +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + struct hash_pte *htab_address; unsigned long htab_size_bytes; unsigned long htab_hash_mask; @@ -120,6 +121,8 @@ static u8 *linear_map_hash_slots; static unsigned long linear_map_hash_count; static DEFINE_SPINLOCK(linear_map_hash_lock); #endif /* CONFIG_DEBUG_PAGEALLOC */ +struct mmu_hash_ops mmu_hash_ops; +EXPORT_SYMBOL(mmu_hash_ops); /* There are definitions of page sizes arrays to be used when none * is provided by the firmware. @@ -159,6 +162,19 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { }, }; +/* + * 'R' and 'C' update notes: + * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* + * create writeable HPTEs without C set, because the hcall H_PROTECT + * that we use in that case will not update C + * - The above is however not a problem, because we also don't do that + * fancy "no flush" variant of eviction and we use H_REMOVE which will + * do the right thing and thus we don't have the race I described earlier + * + * - Under bare metal, we do have the race, so we need R and C set + * - We make sure R is always set and never lost + * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping + */ unsigned long htab_convert_pte_flags(unsigned long pteflags) { unsigned long rflags = 0; @@ -186,19 +202,28 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) rflags |= 0x1; } /* - * Always add "C" bit for perf. Memory coherence is always enabled + * We can't allow hardware to update hpte bits. 
Hence always + * set 'R' bit and set 'C' if it is a write fault */ - rflags |= HPTE_R_C | HPTE_R_M; + rflags |= HPTE_R_R; + + if (pteflags & _PAGE_DIRTY) + rflags |= HPTE_R_C; /* * Add in WIG bits */ if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) rflags |= HPTE_R_I; - if ((pteflags & _PAGE_CACHE_CTL ) == _PAGE_NON_IDEMPOTENT) + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) rflags |= (HPTE_R_I | HPTE_R_G); - if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) - rflags |= (HPTE_R_I | HPTE_R_W); + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); + else + /* + * Add memory coherence if cache inhibited is not set + */ + rflags |= HPTE_R_M; return rflags; } @@ -256,9 +281,10 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, hash = hpt_hash(vpn, shift, ssize); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - BUG_ON(!ppc_md.hpte_insert); - ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot, - HPTE_V_BOLTED, psize, psize, ssize); + BUG_ON(!mmu_hash_ops.hpte_insert); + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); if (ret < 0) break; @@ -283,11 +309,11 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, shift = mmu_psize_defs[psize].shift; step = 1 << shift; - if (!ppc_md.hpte_removebolted) + if (!mmu_hash_ops.hpte_removebolted) return -ENODEV; for (vaddr = vstart; vaddr < vend; vaddr += step) { - rc = ppc_md.hpte_removebolted(vaddr, psize, ssize); + rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); if (rc == -ENOENT) { ret = -ENOENT; continue; @@ -299,6 +325,15 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } +static bool disable_1tb_segments = false; + +static int __init parse_disable_1tb_segments(char *p) +{ + disable_1tb_segments = true; + return 0; +} +early_param("disable_1tb_segments", parse_disable_1tb_segments); + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -317,6 +352,12 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node, for (; size >= 4; size -= 4, ++prop) { if (be32_to_cpu(prop[0]) == 40) { DBG("1T segment support detected\n"); + + if (disable_1tb_segments) { + DBG("1T segments disabled by command line\n"); + break; + } + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; return 1; } @@ -325,11 +366,6 @@ static int __init htab_dt_scan_seg_sizes(unsigned long node, return 0; } -static void __init htab_init_seg_sizes(void) -{ - of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); -} - static int __init get_idx_from_shift(unsigned int shift) { int idx = -1; @@ -492,7 +528,8 @@ static bool might_have_hea(void) * we will never see an HEA ethernet device. 
*/ #ifdef CONFIG_IBMEBUS - return !cpu_has_feature(CPU_FTR_ARCH_207S); + return !cpu_has_feature(CPU_FTR_ARCH_207S) && + firmware_has_feature(FW_FEATURE_SPLPAR); #else return false; #endif @@ -500,7 +537,7 @@ static bool might_have_hea(void) #endif /* #ifdef CONFIG_PPC_64K_PAGES */ -static void __init htab_init_page_sizes(void) +static void __init htab_scan_page_sizes(void) { int rc; @@ -515,17 +552,75 @@ static void __init htab_init_page_sizes(void) * Try to find the available page sizes in the device-tree */ rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - - /* - * Not in the device-tree, let's fallback on known size - * list for 16M capable GP & GR - */ - if (mmu_has_feature(MMU_FTR_16M_PAGE)) + if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { + /* + * Nothing in the device-tree, but the CPU supports 16M pages, + * so let's fallback on a known size list for 16M capable CPUs. + */ memcpy(mmu_psize_defs, mmu_psize_defaults_gp, sizeof(mmu_psize_defaults_gp)); -found: + } + +#ifdef CONFIG_HUGETLB_PAGE + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); +#endif /* CONFIG_HUGETLB_PAGE */ +} + +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. + * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + +static void __init htab_init_page_sizes(void) +{ + init_hpte_page_sizes(); + if (!debug_pagealloc_enabled()) { /* * Pick a size for the linear mapping. Currently, we only @@ -558,7 +653,7 @@ found: * would stop us accessing the HEA ethernet. So if we * have the chance of ever seeing one, stay at 4k. 
*/ - if (!might_have_hea() || !machine_is(pseries)) + if (!might_have_hea()) mmu_io_psize = MMU_PAGE_64K; } else mmu_ci_restrictions = 1; @@ -591,11 +686,6 @@ found: ,mmu_psize_defs[mmu_vmemmap_psize].shift #endif ); - -#ifdef CONFIG_HUGETLB_PAGE - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); -#endif /* CONFIG_HUGETLB_PAGE */ } static int __init htab_dt_scan_pftsize(unsigned long node, @@ -676,11 +766,33 @@ int remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ +static void update_hid_for_hash(void) +{ + unsigned long hid0; + unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */ + + asm volatile("ptesync": : :"memory"); + /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory"); + asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); + /* + * now switch the HID + */ + hid0 = mfspr(SPRN_HID0); + hid0 &= ~HID0_POWER9_RADIX; + mtspr(SPRN_HID0, hid0); + asm volatile("isync": : :"memory"); + + /* Wait for it to happen */ + while ((mfspr(SPRN_HID0) & HID0_POWER9_RADIX)) + cpu_relax(); +} + static void __init hash_init_partition_table(phys_addr_t hash_table, - unsigned long pteg_count) + unsigned long htab_size) { unsigned long ps_field; - unsigned long htab_size; unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; /* @@ -688,7 +800,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * We can ignore that for lpid 0 */ ps_field = 0; - htab_size = __ilog2(pteg_count) - 11; + htab_size = __ilog2(htab_size) - 18; BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); partition_tb = __va(memblock_alloc_base(patb_size, patb_size, @@ -702,7 +814,9 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * For now UPRT is 0 for us. */ partition_tb->patb1 = 0; - DBG("Partition table %p\n", partition_tb); + pr_info("Partition table %p\n", partition_tb); + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + update_hid_for_hash(); /* * update partition table control register, * 64 K size. @@ -716,17 +830,11 @@ static void __init htab_initialize(void) unsigned long table; unsigned long pteg_count; unsigned long prot; - unsigned long base = 0, size = 0, limit; + unsigned long base = 0, size = 0; struct memblock_region *reg; DBG(" -> htab_initialize()\n"); - /* Initialize segment sizes */ - htab_init_seg_sizes(); - - /* Initialize page sizes */ - htab_init_page_sizes(); - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { mmu_kernel_ssize = MMU_SEGSIZE_1T; mmu_highuser_ssize = MMU_SEGSIZE_1T; @@ -742,7 +850,8 @@ static void __init htab_initialize(void) htab_hash_mask = pteg_count - 1; - if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR) || + firmware_has_feature(FW_FEATURE_PS3_LV1)) { /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; @@ -753,20 +862,26 @@ static void __init htab_initialize(void) * Clear the htab if firmware assisted dump is active so * that we dont end up using old mappings. */ - if (is_fadump_active() && ppc_md.hpte_clear_all) - ppc_md.hpte_clear_all(); + if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); #endif } else { - /* Find storage for the HPT. Must be contiguous in - * the absolute address space. On cell we want it to be - * in the first 2 Gig so we can use it for IOMMU hacks. 
+ unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; + +#ifdef CONFIG_PPC_CELL + /* + * Cell may require the hash table down low when using the + * Axon IOMMU in order to fit the dynamic region over it, see + * comments in cell/iommu.c */ - if (machine_is(cell)) + if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { limit = 0x80000000; - else - limit = MEMBLOCK_ALLOC_ANYWHERE; + pr_info("Hash table forced below 2G for Axon IOMMU\n"); + } +#endif /* CONFIG_PPC_CELL */ - table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit); + table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, + limit); DBG("Hash table allocated at %lx, size: %lx\n", table, htab_size_bytes); @@ -774,7 +889,7 @@ static void __init htab_initialize(void) htab_address = __va(table); /* htab absolute addr + encoded htabsize */ - _SDR1 = table + __ilog2(pteg_count) - 11; + _SDR1 = table + __ilog2(htab_size_bytes) - 18; /* Initialize the HPT with no entries */ memset((void *)table, 0, htab_size_bytes); @@ -783,7 +898,7 @@ static void __init htab_initialize(void) /* Set SDR1 */ mtspr(SPRN_SDR1, _SDR1); else - hash_init_partition_table(table, pteg_count); + hash_init_partition_table(table, htab_size_bytes); } prot = pgprot_val(PAGE_KERNEL); @@ -810,34 +925,6 @@ static void __init htab_initialize(void) DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); -#ifdef CONFIG_U3_DART - /* Do not map the DART space. Fortunately, it will be aligned - * in such a way that it will not cross two memblock regions and - * will fit within a single 16Mb page. - * The DART space is assumed to be a full 16Mb region even if - * we only use 2Mb of that space. We will use more of it later - * for AGP GART. We have to use a full 16Mb large page. - */ - DBG("DART base: %lx\n", dart_tablebase); - - if (dart_tablebase != 0 && dart_tablebase >= base - && dart_tablebase < (base + size)) { - unsigned long dart_table_end = dart_tablebase + 16 * MB; - if (base != dart_tablebase) - BUG_ON(htab_bolt_mapping(base, dart_tablebase, - __pa(base), prot, - mmu_linear_psize, - mmu_kernel_ssize)); - if ((base + size) > dart_table_end) - BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, - base + size, - __pa(dart_table_end), - prot, - mmu_linear_psize, - mmu_kernel_ssize)); - continue; - } -#endif /* CONFIG_U3_DART */ BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } @@ -868,8 +955,19 @@ static void __init htab_initialize(void) #undef KB #undef MB +void __init hash__early_init_devtree(void) +{ + /* Initialize segment sizes */ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); + + /* Initialize page sizes */ + htab_scan_page_sizes(); +} + void __init hash__early_init_mmu(void) { + htab_init_page_sizes(); + /* * initialize page table size */ @@ -900,12 +998,28 @@ void __init hash__early_init_mmu(void) vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + + /* Select appropriate backend */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) + ps3_early_mm_init(); + else if (firmware_has_feature(FW_FEATURE_LPAR)) + hpte_init_pseries(); + else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + hpte_init_native(); + + if (!mmu_hash_ops.hpte_insert) + panic("hash__early_init_mmu: No MMU hash ops defined!\n"); + /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. 
*/ htab_initialize(); + pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ slb_initialize(); } @@ -1426,6 +1540,29 @@ out_exit: local_irq_restore(flags); } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline void tm_flush_hash_page(int local) +{ + /* + * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a + * page back to a block device w/PIO could pick up transactional data + * (bad!) so we force an abort here. Before the sync the page will be + * made read-only, which will flush_hash_page. BIG ISSUE here: if the + * kernel uses a page from userspace without unmapping it first, it may + * see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +} +#else +static inline void tm_flush_hash_page(int local) +{ +} +#endif + /* WARNING: This is called from hash_low_64.S, if you change this prototype, * do not forget to update the assembly call site ! */ @@ -1448,24 +1585,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, * We use same base page size and actual psize, because we don't * use these functions for hugepage */ - ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local); + mmu_hash_ops.hpte_invalidate(slot, vpn, psize, psize, + ssize, local); } pte_iterate_hashed_end(); -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* Transactions are not aborted by tlbiel, only tlbie. - * Without, syncing a page back to a block device w/ PIO could pick up - * transactional data (bad!) so we force an abort here. Before the - * sync the page will be made read-only, which will flush_hash_page. - * BIG ISSUE here: if the kernel uses a page from userspace without - * unmapping it first, it may see the speculated version. - */ - if (local && cpu_has_feature(CPU_FTR_TM) && - current->thread.regs && - MSR_TM_ACTIVE(current->thread.regs->msr)) { - tm_enable(); - tm_abort(TM_CAUSE_TLBI); - } -#endif + tm_flush_hash_page(local); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1489,9 +1613,9 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, if (!hpte_slot_array) return; - if (ppc_md.hugepage_invalidate) { - ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array, - psize, ssize, local); + if (mmu_hash_ops.hugepage_invalidate) { + mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array, + psize, ssize, local); goto tm_abort; } /* @@ -1518,33 +1642,18 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ppc_md.hpte_invalidate(slot, vpn, psize, - MMU_PAGE_16M, ssize, local); + mmu_hash_ops.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, local); } tm_abort: -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* Transactions are not aborted by tlbiel, only tlbie. - * Without, syncing a page back to a block device w/ PIO could pick up - * transactional data (bad!) so we force an abort here. Before the - * sync the page will be made read-only, which will flush_hash_page. - * BIG ISSUE here: if the kernel uses a page from userspace without - * unmapping it first, it may see the speculated version. 
- */ - if (local && cpu_has_feature(CPU_FTR_TM) && - current->thread.regs && - MSR_TM_ACTIVE(current->thread.regs->msr)) { - tm_enable(); - tm_abort(TM_CAUSE_TLBI); - } -#endif - return; + tm_flush_hash_page(local); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void flush_hash_range(unsigned long number, int local) { - if (ppc_md.flush_hash_range) - ppc_md.flush_hash_range(number, local); + if (mmu_hash_ops.flush_hash_range) + mmu_hash_ops.flush_hash_range(number, local); else { int i; struct ppc64_tlb_batch *batch = @@ -1589,22 +1698,22 @@ repeat: HPTES_PER_GROUP) & ~0x7UL; /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags, - psize, psize, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags, + psize, psize, ssize); /* Primary is full, try the secondary */ if (unlikely(slot == -1)) { hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, - vflags | HPTE_V_SECONDARY, - psize, psize, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, + vflags | HPTE_V_SECONDARY, + psize, psize, ssize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP)&~0x7UL; - ppc_md.hpte_remove(hpte_group); + mmu_hash_ops.hpte_remove(hpte_group); goto repeat; } } @@ -1654,8 +1763,9 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize, - mmu_kernel_ssize, 0); + mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_linear_psize, + mmu_kernel_ssize, 0); } void __kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index ba3fc229468a..f20d16f849c5 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -103,8 +103,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ret = ppc_md.hpte_updatepp(slot, rflags, vpn, - psize, lpsize, ssize, flags); + ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, flags); /* * We failed to update, try to insert a new entry. 
*/ @@ -131,23 +131,24 @@ repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, - psize, lpsize, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); /* * Primary is full, try the secondary */ if (unlikely(slot == -1)) { hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - psize, lpsize, ssize); + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + psize, lpsize, ssize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - ppc_md.hpte_remove(hpte_group); + mmu_hash_ops.hpte_remove(hpte_group); goto repeat; } } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 3058560b6121..d5026f3800b6 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -79,8 +79,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; - if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, - mmu_psize, ssize, flags) == -1) + if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize, + mmu_psize, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 1e11559e1aac..35254a678456 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -5,39 +5,34 @@ #include <asm/cacheflush.h> #include <asm/machdep.h> #include <asm/mman.h> +#include <asm/tlb.h> void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { - unsigned long ap, shift; + int psize; struct hstate *hstate = hstate_file(vma->vm_file); - shift = huge_page_shift(hstate); - if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) - ap = mmu_get_ap(MMU_PAGE_2M); - else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) - ap = mmu_get_ap(MMU_PAGE_1G); - else { - WARN(1, "Wrong huge page shift\n"); - return ; - } - radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); + psize = hstate_get_psize(hstate); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); } void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { - unsigned long ap, shift; + int psize; struct hstate *hstate = hstate_file(vma->vm_file); - shift = huge_page_shift(hstate); - if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) - ap = mmu_get_ap(MMU_PAGE_2M); - else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) - ap = mmu_get_ap(MMU_PAGE_1G); - else { - WARN(1, "Wrong huge page shift\n"); - return ; - } - radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); + psize = hstate_get_psize(hstate); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); +} + +void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + int psize; + struct hstate *hstate = hstate_file(vma->vm_file); + + psize = hstate_get_psize(hstate); + radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); } /* diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5aac1a3f86cd..a5d3ecdabc44 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -73,7 +73,7 @@ static int __hugepte_alloc(struct 
mm_struct *mm, hugepd_t *hpdp, cachep = PGT_CACHE(pdshift - pshift); #endif - new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); + new = kmem_cache_zalloc(cachep, GFP_KERNEL); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -81,6 +81,13 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, if (! new) return -ENOMEM; + /* + * Make sure other cpus find the hugepd set only after a + * properly initialized page table is visible to them. + * For more details look for comment in __pte_alloc(). + */ + smp_wmb(); + spin_lock(&mm->page_table_lock); #ifdef CONFIG_PPC_FSL_BOOK3E /* @@ -1012,8 +1019,15 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, pte = READ_ONCE(*ptep); mask = _PAGE_PRESENT | _PAGE_READ; + + /* + * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined + * as 0 and _PAGE_RO has to be set when a page is not writable + */ if (write) mask |= _PAGE_WRITE; + else + mask |= _PAGE_RO; if ((pte_val(pte) & mask) != mask) return 0; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index c899fe340bbd..8a7c38b8d335 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -64,7 +64,7 @@ EXPORT_SYMBOL(memstart_addr); phys_addr_t kernstart_addr; EXPORT_SYMBOL(kernstart_addr); -#ifdef CONFIG_RELOCATABLE_PPC32 +#ifdef CONFIG_RELOCATABLE /* Used in __va()/__pa() */ long long virt_phys_offset; EXPORT_SYMBOL(virt_phys_offset); @@ -80,9 +80,6 @@ EXPORT_SYMBOL(agp_special_page); void MMU_init(void); -/* XXX should be in current.h -- paulus */ -extern struct task_struct *current_set[NR_CPUS]; - /* * this tells the system to map all of ram with the segregs * (i.e. page tables) instead of the bats. @@ -140,7 +137,7 @@ void __init MMU_init(void) if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock_enforce_memory_limit(memblock.memory.regions[0].size); - printk(KERN_WARNING "Only using first contiguous memory region"); + pr_warn("Only using first contiguous memory region\n"); #else wii_memory_fixups(); #endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 33709bdb0419..16ada1eb7e26 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -411,3 +411,25 @@ struct page *realmode_pfn_to_page(unsigned long pfn) EXPORT_SYMBOL_GPL(realmode_pfn_to_page); #endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#ifdef CONFIG_PPC_STD_MMU_64 +static bool disable_radix; +static int __init parse_disable_radix(char *p) +{ + disable_radix = true; + return 0; +} +early_param("disable_radix", parse_disable_radix); + +void __init mmu_early_init_devtree(void) +{ + /* Disable radix mode based on kernel command line. 
*/ + if (disable_radix) + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + + if (early_radix_enabled()) + radix__early_init_devtree(); + else + hash__early_init_devtree(); +} +#endif /* CONFIG_PPC_STD_MMU_64 */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2fd57fa48429..5f844337de21 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -116,6 +116,16 @@ int memory_add_physaddr_to_nid(u64 start) } #endif +int __weak create_section_mapping(unsigned long start, unsigned long end) +{ + return -ENODEV; +} + +int __weak remove_section_mapping(unsigned long start, unsigned long end) +{ + return -ENODEV; +} + int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata; @@ -239,8 +249,14 @@ static int __init mark_nonram_nosave(void) static bool zone_limits_final; +/* + * The memory zones past TOP_ZONE are managed by generic mm code. + * These should be set to zero since that's what every other + * architecture does. + */ static unsigned long max_zone_pfns[MAX_NR_ZONES] = { - [0 ... MAX_NR_ZONES - 1] = ~0UL + [0 ... TOP_ZONE ] = ~0UL, + [TOP_ZONE + 1 ... MAX_NR_ZONES - 1] = 0 }; /* diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 227b2a6c4544..b114f8b93ec9 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -65,7 +65,7 @@ static int radix__init_new_context(struct mm_struct *mm, int index) /* * set the process table entry, */ - rts_field = 3ull << PPC_BITLSHIFT(2); + rts_field = radix__get_tree_size(); process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); return 0; } @@ -181,7 +181,10 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_PPC_RADIX_MMU void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - mtspr(SPRN_PID, next->context.id); asm volatile("isync": : :"memory"); + mtspr(SPRN_PID, next->context.id); + asm volatile("isync \n" + PPC_SLBIA(0x7) + : : :"memory"); } #endif diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index da6a2168ae9e..e0f1c33601dd 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -15,6 +15,9 @@ #include <linux/rculist.h> #include <linux/vmalloc.h> #include <linux/mutex.h> +#include <linux/migrate.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> #include <asm/mmu_context.h> static DEFINE_MUTEX(mem_list_mutex); @@ -72,6 +75,55 @@ bool mm_iommu_preregistered(void) } EXPORT_SYMBOL_GPL(mm_iommu_preregistered); +/* + * Taken from alloc_migrate_target with changes to remove CMA allocations + */ +struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER; + struct page *new_page; + + if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) + return NULL; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + /* + * We don't want the allocation to force an OOM if possibe + */ + new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN); + return new_page; +} + +static int mm_iommu_move_page_from_cma(struct page *page) +{ + int ret = 0; + LIST_HEAD(cma_migrate_pages); + + /* Ignore huge pages for now */ + if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) + return -EBUSY; + + lru_add_drain(); + ret = isolate_lru_page(page); + if (ret) + return ret; + + list_add(&page->lru, &cma_migrate_pages); + put_page(page); /* Drop the gup reference */ + + ret = 
migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, + NULL, 0, MIGRATE_SYNC, MR_CMA); + if (ret) { + if (!list_empty(&cma_migrate_pages)) + putback_movable_pages(&cma_migrate_pages); + } + + return 0; +} + long mm_iommu_get(unsigned long ua, unsigned long entries, struct mm_iommu_table_group_mem_t **pmem) { @@ -124,15 +176,36 @@ long mm_iommu_get(unsigned long ua, unsigned long entries, for (i = 0; i < entries; ++i) { if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), 1/* pages */, 1/* iswrite */, &page)) { + ret = -EFAULT; for (j = 0; j < i; ++j) - put_page(pfn_to_page( - mem->hpas[j] >> PAGE_SHIFT)); + put_page(pfn_to_page(mem->hpas[j] >> + PAGE_SHIFT)); vfree(mem->hpas); kfree(mem); - ret = -EFAULT; goto unlock_exit; } - + /* + * If we get a page from the CMA zone, since we are going to + * be pinning these entries, we might as well move them out + * of the CMA zone if possible. NOTE: faulting in + migration + * can be expensive. Batching can be considered later + */ + if (get_pageblock_migratetype(page) == MIGRATE_CMA) { + if (mm_iommu_move_page_from_cma(page)) + goto populate; + if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + 1/* pages */, 1/* iswrite */, + &page)) { + ret = -EFAULT; + for (j = 0; j < i; ++j) + put_page(pfn_to_page(mem->hpas[j] >> + PAGE_SHIFT)); + vfree(mem->hpas); + kfree(mem); + goto unlock_exit; + } + } +populate: mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; } diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 7d95bc402dba..c491f2c8f2b9 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -369,44 +369,34 @@ void destroy_context(struct mm_struct *mm) } #ifdef CONFIG_SMP - -static int mmu_context_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int mmu_ctx_cpu_prepare(unsigned int cpu) { - unsigned int cpu = (unsigned int)(long)hcpu; - /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ if (cpu == boot_cpuid) - return NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); - stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); - kfree(stale_map[cpu]); - stale_map[cpu] = NULL; - - /* We also clear the cpu_vm_mask bits of CPUs going away */ - clear_tasks_mm_cpumask(cpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; + return 0; + + pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); + stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); + return 0; } -static struct notifier_block mmu_context_cpu_nb = { - .notifier_call = mmu_context_cpu_notify, -}; +static int mmu_ctx_cpu_dead(unsigned int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (cpu == boot_cpuid) + return 0; + + pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); + kfree(stale_map[cpu]); + stale_map[cpu] = NULL; + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + clear_tasks_mm_cpumask(cpu); +#endif + return 0; +} #endif /* CONFIG_SMP */ @@ -469,7 +459,9 @@ void __init mmu_context_init(void) #else stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0); - register_cpu_notifier(&mmu_context_cpu_nb); + cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, + "powerpc/mmu/ctx:prepare", + 
mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); #endif printk(KERN_INFO diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 6af65327c993..f988db655e5b 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -154,9 +154,10 @@ struct tlbcam { }; #endif -#if defined(CONFIG_6xx) || defined(CONFIG_FSL_BOOKE) +#if defined(CONFIG_6xx) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx) /* 6xx have BATS */ /* FSL_BOOKE have TLBCAM */ +/* 8xx have LTLB */ phys_addr_t v_block_mapped(unsigned long va); unsigned long p_block_mapped(phys_addr_t pa); #else diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 669a15e7fa76..75b9cd6150cc 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -581,30 +581,22 @@ static void verify_cpu_node_mapping(int cpu, int node) } } -static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +/* Must run before sched domains notifier. */ +static int ppc_numa_cpu_prepare(unsigned int cpu) { - unsigned long lcpu = (unsigned long)hcpu; - int ret = NOTIFY_DONE, nid; + int nid; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - nid = numa_setup_cpu(lcpu); - verify_cpu_node_mapping((int)lcpu, nid); - ret = NOTIFY_OK; - break; + nid = numa_setup_cpu(cpu); + verify_cpu_node_mapping(cpu, nid); + return 0; +} + +static int ppc_numa_cpu_dead(unsigned int cpu) +{ #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - unmap_cpu_from_node(lcpu); - ret = NOTIFY_OK; - break; + unmap_cpu_from_node(cpu); #endif - } - return ret; + return 0; } /* @@ -913,11 +905,6 @@ static void __init dump_numa_memory_topology(void) } } -static struct notifier_block ppc64_numa_nb = { - .notifier_call = cpu_numa_callback, - .priority = 1 /* Must run before sched domains notifier. */ -}; - /* Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { @@ -985,15 +972,18 @@ void __init initmem_init(void) setup_node_to_cpumask_map(); reset_numa_cpu_lookup_table(); - register_cpu_notifier(&ppc64_numa_nb); + /* * We need the numa_cpu_lookup_table to be accurate for all CPUs, * even before we online them, so that we can use cpu_to_{node,mem} * early in boot, cf. smp_prepare_cpus(). + * _nocalls() + manual invocation is used because cpuhp is not yet + * initialized for the boot CPU. 
*/ - for_each_present_cpu(cpu) { - numa_setup_cpu((unsigned long)cpu); - } + cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "POWER_NUMA_PREPARE", + ppc_numa_cpu_prepare, ppc_numa_cpu_dead); + for_each_present_cpu(cpu) + numa_setup_cpu(cpu); } static int __init early_numa(char *p) @@ -1163,18 +1153,34 @@ int hot_add_scn_to_nid(unsigned long scn_addr) static u64 hot_add_drconf_memory_max(void) { - struct device_node *memory = NULL; - unsigned int drconf_cell_cnt = 0; - u64 lmb_size = 0; + struct device_node *memory = NULL; + struct device_node *dn = NULL; + unsigned int drconf_cell_cnt = 0; + u64 lmb_size = 0; const __be32 *dm = NULL; + const __be64 *lrdr = NULL; + struct of_drconf_cell drmem; + + dn = of_find_node_by_path("/rtas"); + if (dn) { + lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL); + of_node_put(dn); + if (lrdr) + return be64_to_cpup(lrdr); + } - memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (memory) { - drconf_cell_cnt = of_get_drconf_memory(memory, &dm); - lmb_size = of_get_lmb_size(memory); - of_node_put(memory); - } - return lmb_size * drconf_cell_cnt; + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + lmb_size = of_get_lmb_size(memory); + + /* Advance to the last cell, each cell has 6 32 bit integers */ + dm += (drconf_cell_cnt - 1) * 6; + read_drconf_cell(&drmem, &dm); + of_node_put(memory); + return drmem.base_addr + lmb_size; + } + return 0; } /* diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index eb4451144746..f4f437cbabf1 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -14,6 +14,9 @@ #include "mmu_decl.h" #include <trace/events/thp.h> +int (*register_process_table)(unsigned long base, unsigned long page_size, + unsigned long tbl_size); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page @@ -32,11 +35,8 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, #endif changed = !pmd_same(*(pmdp), entry); if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ + __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp), pmd_pte(entry)); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } @@ -69,7 +69,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. 
@@ -116,3 +116,12 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, return; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* For use by kexec */ +void mmu_cleanup_all(void) +{ + if (radix_enabled()) + radix__mmu_cleanup_all(); + else if (mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +} diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 18b2c11604fa..ed7bddc456b7 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -21,8 +21,11 @@ #include <trace/events/thp.h> -static int native_update_partition_table(u64 patb1) +static int native_register_process_table(unsigned long base, unsigned long pg_sz, + unsigned long table_size) { + unsigned long patb1 = base | table_size | PATB_GR; + partition_tb->patb1 = cpu_to_be64(patb1); return 0; } @@ -160,32 +163,30 @@ redo: process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); /* * Fill in the process table. - * we support 52 bits, hence 52-28 = 24, 11000 */ - rts_field = 3ull << PPC_BITLSHIFT(2); + rts_field = radix__get_tree_size(); process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); /* * Fill in the partition table. We are suppose to use effective address * of process table here. But our linear mapping also enable us to use * physical address here. */ - ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR); + register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); } static void __init radix_init_partition_table(void) { unsigned long rts_field; - /* - * we support 52 bits, hence 52-28 = 24, 11000 - */ - rts_field = 3ull << PPC_BITLSHIFT(2); + + rts_field = radix__get_tree_size(); BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR); - printk("Partition table %p\n", partition_tb); + pr_info("Initializing Radix MMU\n"); + pr_info("Partition table %p\n", partition_tb); memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); /* @@ -197,7 +198,7 @@ static void __init radix_init_partition_table(void) void __init radix_init_native(void) { - ppc_md.update_partition_table = native_update_partition_table; + register_process_table = native_register_process_table; } static int __init get_idx_from_shift(unsigned int shift) @@ -263,7 +264,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, return 1; } -static void __init radix_init_page_sizes(void) +void __init radix__early_init_devtree(void) { int rc; @@ -293,14 +294,35 @@ found: return; } -void __init radix__early_init_mmu(void) +static void update_hid_for_radix(void) { - unsigned long lpcr; + unsigned long hid0; + unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */ + + asm volatile("ptesync": : :"memory"); + /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory"); + /* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory"); + asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory"); /* - * setup LPCR UPRT based on mmu_features + * now switch the HID */ - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + hid0 = mfspr(SPRN_HID0); + hid0 |= 
HID0_POWER9_RADIX; + mtspr(SPRN_HID0, hid0); + asm volatile("isync": : :"memory"); + + /* Wait for it to happen */ + while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX)) + cpu_relax(); +} + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ @@ -336,15 +358,25 @@ void __init radix__early_init_mmu(void) __vmalloc_end = RADIX_VMALLOC_END; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + /* * For now radix also use the same frag size */ __pte_frag_nr = H_PTE_FRAG_NR; __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; - radix_init_page_sizes(); - if (!firmware_has_feature(FW_FEATURE_LPAR)) + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + radix_init_native(); + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + update_hid_for_radix(); + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); + } radix_init_pgtable(); } @@ -353,16 +385,27 @@ void radix__early_init_mmu_secondary(void) { unsigned long lpcr; /* - * setup LPCR UPRT based on mmu_features - */ - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); - /* - * update partition table control register, 64 K size. + * update partition table control register and UPRT */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } +} + +void radix__mmu_cleanup_all(void) +{ + unsigned long lpcr; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); + mtspr(SPRN_PTCR, 0); + radix__flush_tlb_all(); + } } void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 88a307504b5a..911fdfb63ec1 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -224,8 +224,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, if (changed) { if (!is_vm_hugetlb_page(vma)) assert_pte_locked(vma->vm_mm, address); - __ptep_set_access_flags(ptep, entry); - flush_tlb_page_nohash(vma, address); + __ptep_set_access_flags(vma->vm_mm, ptep, entry); + flush_tlb_page(vma, address); } return changed; } diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index bf7bf32b54f8..0ae0572bc239 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -79,12 +79,12 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; if (slab_is_available()) { - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); } else { pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); if (pte) @@ -97,7 +97,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *ptepage; - gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; + gfp_t flags = GFP_KERNEL | __GFP_ZERO; ptepage = alloc_pages(flags, 0); if (!ptepage) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e009e0604a8a..f5e8d4edb808 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -350,8 +350,7 @@ static pte_t 
*get_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (!page) return NULL; if (!kernel && !pgtable_page_ctor(page)) { diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index dfdb90cb4403..e2974fcd20f1 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -113,7 +113,12 @@ BEGIN_FTR_SECTION END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) b slb_finish_load_1T -0: +0: /* + * For userspace addresses, make sure this is region 0. + */ + cmpdi r9, 0 + bne 8f + /* when using slices, we extract the psize off the slice bitmaps * and then we need to get the sllp encoding off the mmu_psize_defs * array. @@ -173,11 +178,9 @@ BEGIN_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) b slb_finish_load -8: /* invalid EA */ - li r10,0 /* BAD_VSID */ - li r9,0 /* BAD_VSID */ - li r11,SLB_VSID_USER /* flags don't much matter */ - b slb_finish_load +8: /* invalid EA - return an error indication */ + crset 4*cr0+eq /* indicate failure */ + blr /* * Finish loading of an SLB entry and return diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 0fdaf93a3e09..0e49ec541ab5 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -12,26 +12,30 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/memblock.h> +#include <asm/ppc-opcode.h> #include <asm/tlb.h> #include <asm/tlbflush.h> static DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void __tlbiel_pid(unsigned long pid, int set) +#define RIC_FLUSH_TLB 0 +#define RIC_FLUSH_PWC 1 +#define RIC_FLUSH_ALL 2 + +static inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) { - unsigned long rb,rs,ric,prs,r; + unsigned long rb,rs,prs,r; rb = PPC_BIT(53); /* IS = 1 */ rb |= set << PPC_BITLSHIFT(51); rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ r = 1; /* raidx format */ - ric = 2; /* invalidate all the caches */ asm volatile("ptesync": : :"memory"); - asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" - "(%2 << 17) | (%3 << 18) | (%4 << 21)" + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("ptesync": : :"memory"); } @@ -39,67 +43,61 @@ static inline void __tlbiel_pid(unsigned long pid, int set) /* * We use 128 set in radix mode and 256 set in hpt mode. 
*/ -static inline void _tlbiel_pid(unsigned long pid) +static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { - __tlbiel_pid(pid, set); + __tlbiel_pid(pid, set, ric); } return; } -static inline void _tlbie_pid(unsigned long pid) +static inline void _tlbie_pid(unsigned long pid, unsigned long ric) { - unsigned long rb,rs,ric,prs,r; + unsigned long rb,rs,prs,r; rb = PPC_BIT(53); /* IS = 1 */ rs = pid << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ r = 1; /* raidx format */ - ric = 2; /* invalidate all the caches */ asm volatile("ptesync": : :"memory"); - asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" - "(%2 << 17) | (%3 << 18) | (%4 << 21)" + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap) + unsigned long ap, unsigned long ric) { - unsigned long rb,rs,ric,prs,r; + unsigned long rb,rs,prs,r; rb = va & ~(PPC_BITMASK(52, 63)); rb |= ap << PPC_BITLSHIFT(58); rs = pid << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ r = 1; /* raidx format */ - ric = 0; /* no cluster flush yet */ asm volatile("ptesync": : :"memory"); - asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" - "(%2 << 17) | (%3 << 18) | (%4 << 21)" + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("ptesync": : :"memory"); } static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap) + unsigned long ap, unsigned long ric) { - unsigned long rb,rs,ric,prs,r; + unsigned long rb,rs,prs,r; rb = va & ~(PPC_BITMASK(52, 63)); rb |= ap << PPC_BITLSHIFT(58); rs = pid << PPC_BITLSHIFT(31); prs = 1; /* process scoped */ r = 1; /* raidx format */ - ric = 0; /* no cluster flush yet */ asm volatile("ptesync": : :"memory"); - asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" - "(%2 << 17) | (%3 << 18) | (%4 << 21)" + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -117,25 +115,41 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, */ void radix__local_flush_tlb_mm(struct mm_struct *mm) { - unsigned int pid; + unsigned long pid; preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid); + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); -void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - unsigned long ap, int nid) +void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { - unsigned int pid; + unsigned long pid; + struct mm_struct *mm = tlb->mm; + + preempt_disable(); + + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_PWC); + + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_pwc); + +void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + unsigned long ap = mmu_get_ap(psize); preempt_disable(); pid = mm ? 
mm->context.id : 0; if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, ap); + _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); preempt_enable(); } @@ -146,23 +160,43 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd if (vma && is_vm_hugetlb_page(vma)) return __local_flush_hugetlb_page(vma, vmaddr); #endif - radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_ap(mmu_virtual_psize), 0); + radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, + mmu_virtual_psize); } EXPORT_SYMBOL(radix__local_flush_tlb_page); #ifdef CONFIG_SMP -static int mm_is_core_local(struct mm_struct *mm) +void radix__flush_tlb_mm(struct mm_struct *mm) { - return cpumask_subset(mm_cpumask(mm), - topology_sibling_cpumask(smp_processor_id())); + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(pid, RIC_FLUSH_ALL); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_pid(pid, RIC_FLUSH_ALL); +no_context: + preempt_enable(); } +EXPORT_SYMBOL(radix__flush_tlb_mm); -void radix__flush_tlb_mm(struct mm_struct *mm) +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { - unsigned int pid; + unsigned long pid; + struct mm_struct *mm = tlb->mm; preempt_disable(); + pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; @@ -172,20 +206,21 @@ void radix__flush_tlb_mm(struct mm_struct *mm) if (lock_tlbie) raw_spin_lock(&native_tlbie_lock); - _tlbie_pid(pid); + _tlbie_pid(pid, RIC_FLUSH_PWC); if (lock_tlbie) raw_spin_unlock(&native_tlbie_lock); } else - _tlbiel_pid(pid); + _tlbiel_pid(pid, RIC_FLUSH_PWC); no_context: preempt_enable(); } -EXPORT_SYMBOL(radix__flush_tlb_mm); +EXPORT_SYMBOL(radix__flush_tlb_pwc); -void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - unsigned long ap, int nid) +void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) { - unsigned int pid; + unsigned long pid; + unsigned long ap = mmu_get_ap(psize); preempt_disable(); pid = mm ? mm->context.id : 0; @@ -196,11 +231,11 @@ void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, if (lock_tlbie) raw_spin_lock(&native_tlbie_lock); - _tlbie_va(vmaddr, pid, ap); + _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); if (lock_tlbie) raw_spin_unlock(&native_tlbie_lock); } else - _tlbiel_va(vmaddr, pid, ap); + _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); bail: preempt_enable(); } @@ -211,8 +246,8 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) if (vma && is_vm_hugetlb_page(vma)) return flush_hugetlb_page(vma, vmaddr); #endif - radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_ap(mmu_virtual_psize), 0); + radix__flush_tlb_page_psize(vma ? 
vma->vm_mm : NULL, vmaddr, + mmu_virtual_psize); } EXPORT_SYMBOL(radix__flush_tlb_page); @@ -224,7 +259,7 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) if (lock_tlbie) raw_spin_lock(&native_tlbie_lock); - _tlbie_pid(0); + _tlbie_pid(0, RIC_FLUSH_ALL); if (lock_tlbie) raw_spin_unlock(&native_tlbie_lock); } @@ -243,9 +278,149 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, } EXPORT_SYMBOL(radix__flush_tlb_range); +static int radix_get_mmu_psize(int page_size) +{ + int psize; + + if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) + psize = mmu_virtual_psize; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) + psize = MMU_PAGE_2M; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) + psize = MMU_PAGE_1G; + else + return -1; + return psize; +} void radix__tlb_flush(struct mmu_gather *tlb) { + int psize = 0; struct mm_struct *mm = tlb->mm; - radix__flush_tlb_mm(mm); + int page_size = tlb->page_size; + + psize = radix_get_mmu_psize(page_size); + /* + * if page size is not something we understand, do a full mm flush + */ + if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); + else + radix__flush_tlb_mm(mm); +} + +#define TLB_FLUSH_ALL -1UL +/* + * Number of pages above which we will do a bcast tlbie. Just a + * number at this point copied from x86 + */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + +void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + unsigned long pid; + unsigned long addr; + int local = mm_is_core_local(mm); + unsigned long ap = mmu_get_ap(psize); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; + + + preempt_disable(); + pid = mm ? 
mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto err_out; + + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling * page_size) { + if (local) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + goto err_out; + } + for (addr = start; addr < end; addr += page_size) { + + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else { + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } + } +err_out: + preempt_enable(); +} + +void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, + unsigned long page_size) +{ + unsigned long rb,rs,prs,r; + unsigned long ap; + unsigned long ric = RIC_FLUSH_TLB; + + ap = mmu_get_ap(radix_get_mmu_psize(page_size)); + rb = gpa & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = lpid & ((1UL << 32) - 1); + prs = 0; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} +EXPORT_SYMBOL(radix__flush_tlb_lpid_va); + +void radix__flush_tlb_lpid(unsigned long lpid) +{ + unsigned long rb,rs,prs,r; + unsigned long ric = RIC_FLUSH_ALL; + + rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */ + rs = lpid & ((1UL << 32) - 1); + prs = 0; /* partition scoped */ + r = 1; /* raidx format */ + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} +EXPORT_SYMBOL(radix__flush_tlb_lpid); + +void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M); +} +EXPORT_SYMBOL(radix__flush_pmd_tlb_range); + +void radix__flush_tlb_all(void) +{ + unsigned long rb,prs,r,rs; + unsigned long ric = RIC_FLUSH_ALL; + + rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */ + prs = 0; /* partition scoped */ + r = 1; /* raidx format */ + rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */ + + asm volatile("ptesync": : :"memory"); + /* + * now flush guest entries by passing PRS = 1 and LPID != 0 + */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory"); + /* + * now flush host entires by passing PRS = 0 and LPID == 0 + */ + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 558e30cce33e..702d7689d714 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -49,17 +49,6 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) EXPORT_SYMBOL(flush_hash_entry); /* - * Called by ptep_set_access_flags, must flush on CPUs for which the - * DSI handler can't just "fixup" the TLB on a write fault - */ -void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr) -{ - if (Hash != 0) - return; - _tlbie(addr); -} - -/* * Called at the end of a mmu_gather operation to make sure the * TLB flush is completely done. 
*/ diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index f4668488512c..050badc0ebd3 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -215,12 +215,6 @@ EXPORT_SYMBOL(local_flush_tlb_page); static DEFINE_RAW_SPINLOCK(tlbivax_lock); -static int mm_is_core_local(struct mm_struct *mm) -{ - return cpumask_subset(mm_cpumask(mm), - topology_sibling_cpumask(smp_processor_id())); -} - struct tlb_flush_param { unsigned long addr; unsigned int pid; |
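To summarize the flushing policy introduced in tlb-radix.c above: radix__flush_tlb_range_psize() issues one tlbiel/tlbie per page for small ranges, but falls back to a single PID-wide invalidation once the range exceeds tlb_single_page_flush_ceiling pages (a value the patch admits is copied from x86). A simplified, self-contained sketch of just that decision, using hypothetical names (FLUSH_CEILING, pick_flush) rather than the kernel's:

	/* Illustration only; mirrors the heuristic, not the kernel API. */
	#define FLUSH_ALL_MARKER	((unsigned long)-1)	/* like TLB_FLUSH_ALL */
	#define FLUSH_CEILING		33UL			/* pages, as in the patch */

	enum flush_kind { FLUSH_PER_PAGE, FLUSH_WHOLE_PID };

	static enum flush_kind pick_flush(unsigned long start, unsigned long end,
					  unsigned long page_size)
	{
		/*
		 * A "flush everything" request, or a range so large that per-page
		 * invalidation would cost more than one PID-scoped flush.
		 */
		if (end == FLUSH_ALL_MARKER ||
		    (end - start) > FLUSH_CEILING * page_size)
			return FLUSH_WHOLE_PID;

		/* Small range: invalidate each page individually (RIC_FLUSH_TLB). */
		return FLUSH_PER_PAGE;
	}

The local-versus-broadcast choice is made separately in the patch: when the mm has only run on the current core, the local tlbiel form is used; otherwise tlbie is broadcast, serialized by native_tlbie_lock on hardware without MMU_FTR_LOCKLESS_TLBIE.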