Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/Makefile         |   3
-rw-r--r--  arch/powerpc/mm/fault.c          |  25
-rw-r--r--  arch/powerpc/mm/fsl_booke_mmu.c  |   5
-rw-r--r--  arch/powerpc/mm/gup.c            | 281
-rw-r--r--  arch/powerpc/mm/hash_low_32.S    | 122
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c  | 120
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c    | 388
-rw-r--r--  arch/powerpc/mm/init_64.c        |  41
-rw-r--r--  arch/powerpc/mm/mem.c            |  63
-rw-r--r--  arch/powerpc/mm/numa.c           | 199
-rw-r--r--  arch/powerpc/mm/pgtable_32.c     |  26
-rw-r--r--  arch/powerpc/mm/pgtable_64.c     |  16
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c     |   4
-rw-r--r--  arch/powerpc/mm/tlb_32.c         |   1
-rw-r--r--  arch/powerpc/mm/tlb_64.c         |   4
15 files changed, 946 insertions, 352 deletions
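
[Editor's note, not part of the commit: the largest addition below is arch/powerpc/mm/gup.c, which implements the lockless get_user_pages_fast() interface for powerpc. The following is a minimal, hypothetical kernel-side caller sketch using the 2008-era signature taken from the diff; the function name and error handling are illustrative only.]

/*
 * Illustrative only -- not part of this commit.  Pins a user buffer with
 * get_user_pages_fast() (as added by arch/powerpc/mm/gup.c below) and
 * then drops the references again.
 */
#include <linux/mm.h>
#include <linux/slab.h>

static int example_pin_user_buffer(unsigned long uaddr, size_t len, int write)
{
	int nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct page **pages;
	int i, got;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* No mmap_sem needed; gup.c falls back to get_user_pages() itself. */
	got = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (got < 0) {
		kfree(pages);
		return got;
	}

	/* ... the pinned pages can now be used for DMA, direct I/O, etc. ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);
	kfree(pages);
	return got == nr_pages ? 0 : -EFAULT;
}
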
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 1c00e0196f6c..e7392b45a5ef 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -12,7 +12,8 @@ obj-y := fault.o mem.o \ mmu_context_$(CONFIG_WORD_SIZE).o hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC64) += hash_utils_64.o \ - slb_low.o slb.o stab.o mmap.o $(hash-y) + slb_low.o slb.o stab.o \ + gup.o mmap.o $(hash-y) obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ tlb_$(CONFIG_WORD_SIZE).o diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 1707d00331fc..565b7a237c84 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs) return 0; } -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) -static void do_dabr(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - siginfo_t info; - - if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, - 11, SIGSEGV) == NOTIFY_STOP) - return; - - if (debugger_dabr_match(regs)) - return; - - /* Clear the DABR */ - set_dabr(0); - - /* Deliver the signal to userspace */ - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)address; - force_sig_info(SIGTRAP, &info, current); -} -#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ - /* * For 600- and 800-family processors, the error_code parameter is DSISR * for a data fault, SRR1 for an instruction fault. For 400-family processors diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index ce10e2b1b902..23cee39534fd 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -202,7 +202,7 @@ adjust_total_lowmem(void) cam_max_size = max_lowmem_size; /* adjust lowmem size to max_lowmem_size */ - ram = min(max_lowmem_size, (phys_addr_t)total_lowmem); + ram = min(max_lowmem_size, total_lowmem); /* Calculate CAM values */ __cam0 = 1UL << 2 * (__ilog2(ram) / 2); @@ -225,7 +225,8 @@ adjust_total_lowmem(void) printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb," " CAM2=%ldMb residual: %ldMb\n", __cam0 >> 20, __cam1 >> 20, __cam2 >> 20, - (total_lowmem - __cam0 - __cam1 - __cam2) >> 20); + (long int)((total_lowmem - __cam0 - __cam1 - __cam2) + >> 20)); __max_low_memory = __cam0 + __cam1 + __cam2; __initial_memory_limit_addr = memstart_addr + __max_low_memory; } diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c new file mode 100644 index 000000000000..28a114db3ba0 --- /dev/null +++ b/arch/powerpc/mm/gup.c @@ -0,0 +1,281 @@ +/* + * Lockless get_user_pages_fast for powerpc + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#undef DEBUG + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/vmstat.h> +#include <linux/pagemap.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. 
+ */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask, result; + pte_t *ptep; + + result = _PAGE_PRESENT|_PAGE_USER; + if (write) + result |= _PAGE_RW; + mask = result | _PAGE_SPECIAL; + + ptep = pte_offset_kernel(&pmd, addr); + do { + pte_t pte = *ptep; + struct page *page; + + if ((pte_val(pte) & mask) != result) + return 0; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + if (!page_cache_get_speculative(page)) + return 0; + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(page); + return 0; + } + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate, + unsigned long *addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate); + if (pte_end < end) + end = pte_end; + + pte = *ptep; + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (*addr += PAGE_SIZE, *addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + while (*nr) { + put_page(page); + (*nr)--; + } + } + + return 1; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int psize, nr = 0; + unsigned int shift; + + pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + goto slow_irqon; + + pr_debug(" aligned: %lx .. 
%lx\n", start, end); + +#ifdef CONFIG_HUGETLB_PAGE + /* We bail out on slice boundary crossing when hugetlb is + * enabled in order to not have to deal with two different + * page table formats + */ + if (addr < SLICE_LOW_TOP) { + if (end > SLICE_LOW_TOP) + goto slow_irqon; + + if (unlikely(GET_LOW_SLICE_INDEX(addr) != + GET_LOW_SLICE_INDEX(end - 1))) + goto slow_irqon; + } else { + if (unlikely(GET_HIGH_SLICE_INDEX(addr) != + GET_HIGH_SLICE_INDEX(end - 1))) + goto slow_irqon; + } +#endif /* CONFIG_HUGETLB_PAGE */ + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables from being freed on powerpc. + * + * So long as we atomically load page table pointers versus teardown, + * we can follow the address down to the the page and take a ref on it. + */ + local_irq_disable(); + + psize = get_slice_psize(mm, addr); + shift = mmu_psize_defs[psize].shift; + +#ifdef CONFIG_HUGETLB_PAGE + if (unlikely(mmu_huge_psizes[psize])) { + pte_t *ptep; + unsigned long a = addr; + unsigned long sz = ((1UL) << shift); + struct hstate *hstate = size_to_hstate(sz); + + BUG_ON(!hstate); + /* + * XXX: could be optimized to avoid hstate + * lookup entirely (just use shift) + */ + + do { + VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift); + ptep = huge_pte_offset(mm, a); + pr_debug(" %016lx: huge ptep %p\n", a, ptep); + if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages, + &nr)) + goto slow; + } while (a != end); + } else +#endif /* CONFIG_HUGETLB_PAGE */ + { + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); + pr_debug(" %016lx: normal pgd %p\n", addr, + (void *)pgd_val(pgd)); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + } + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + pr_debug(" slow path ! nr = %d\n", nr); + + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index b9ba7d930801..7bffb70b9fe2 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync) * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, ctr, lr. + * Uses r0, r3 - r8, r10, ctr, lr. 
*/ .text _GLOBAL(hash_page) @@ -106,9 +106,15 @@ _GLOBAL(hash_page) addi r5,r5,swapper_pg_dir@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ 112: add r5,r5,r7 /* convert to phys addr */ +#ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif #ifdef CONFIG_SMP beq- hash_page_out /* return if no mapping */ #else @@ -118,7 +124,11 @@ _GLOBAL(hash_page) to the address following the rfi. */ beqlr- #endif +#ifndef CONFIG_PTE_64BIT rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE @@ -127,9 +137,15 @@ _GLOBAL(hash_page) * because almost always, there won't be a permission violation * and there won't already be an HPTE, and thus we will have * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif retry: - lwarx r6,0,r8 /* get linux-style pte */ + lwarx r6,0,r8 /* get linux-style pte, flag word */ andc. r5,r3,r6 /* check access & ~permission */ #ifdef CONFIG_SMP bne- hash_page_out /* return if access not permitted */ @@ -137,6 +153,15 @@ retry: bnelr- #endif or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ stwcx. r5,0,r8 /* attempt to update PTE */ bne- retry /* retry if someone got there first */ @@ -203,9 +228,9 @@ _GLOBAL(add_hash_page) * we can't take a hash table miss (assuming the code is * covered by a BAT). -- paulus */ - mfmsr r10 + mfmsr r9 SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 SYNC_601 @@ -214,14 +239,14 @@ _GLOBAL(add_hash_page) tophys(r7,0) #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l -10: lwarx r0,0,r9 /* take the mmu_hash_lock */ + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ cmpi 0,r0,0 bne- 11f - stwcx. r8,0,r9 + stwcx. r8,0,r6 beq+ 12f -11: lwz r0,0(r9) +11: lwz r0,0(r6) cmpi 0,r0,0 beq 10b b 11b @@ -234,10 +259,24 @@ _GLOBAL(add_hash_page) * HPTE, so we just unlock and return. */ mr r8,r5 +#ifndef CONFIG_PTE_64BIT rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif 1: lwarx r6,0,r8 andi. r0,r6,_PAGE_HASHPTE bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ ori r5,r6,_PAGE_HASHPTE stwcx. 
r5,0,r8 bne- 1b @@ -246,13 +285,15 @@ _GLOBAL(add_hash_page) 9: #ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l eieio li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ + stw r0,0(r6) /* clear mmu_hash_lock */ #endif /* reenable interrupts and DR */ - mtmsr r10 + mtmsr r9 SYNC_601 isync @@ -267,7 +308,8 @@ _GLOBAL(add_hash_page) * r5 contains the linux PTE, r6 contains the old value of the * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). + * -KERNELBASE if it is off). r10 contains the upper half of + * the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. The value passed in r6 should @@ -285,7 +327,7 @@ Hash_bits = 12 /* e.g. 256kB hash table */ Hash_msk = (((1 << Hash_bits) - 1) * 64) /* defines for the PTE format for 32-bit PPCs */ -#define PTE_SIZE 8 +#define HPTE_SIZE 8 #define PTEG_SIZE 64 #define LG_PTEG_SIZE 6 #define LDPTEu lwzu @@ -313,6 +355,11 @@ _GLOBAL(create_hpte) BEGIN_FTR_SECTION ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */ END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif /* Construct the high word of the PPC-style PTE (r5) */ rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ @@ -342,8 +389,8 @@ _GLOBAL(hash_page_patch_A) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 - addi r4,r3,-PTE_SIZE -1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */ + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ CMPPTE 0,r6,r5 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ found_slot @@ -353,9 +400,9 @@ _GLOBAL(hash_page_patch_A) _GLOBAL(hash_page_patch_B) xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-PTE_SIZE + addi r4,r4,-HPTE_SIZE mtctr r0 -2: LDPTEu r6,PTE_SIZE(r4) +2: LDPTEu r6,HPTE_SIZE(r4) CMPPTE 0,r6,r5 bdnzf 2,2b beq+ found_slot @@ -363,8 +410,8 @@ _GLOBAL(hash_page_patch_B) /* Search the primary PTEG for an empty slot */ 10: mtctr r0 - addi r4,r3,-PTE_SIZE /* search primary PTEG */ -1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */ + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ TST_V(r6) /* test valid bit */ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ found_empty @@ -380,9 +427,9 @@ _GLOBAL(hash_page_patch_B) _GLOBAL(hash_page_patch_C) xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-PTE_SIZE + addi r4,r4,-HPTE_SIZE mtctr r0 -2: LDPTEu r6,PTE_SIZE(r4) +2: LDPTEu r6,HPTE_SIZE(r4) TST_V(r6) bdnzf 2,2b beq+ found_empty @@ -409,11 +456,11 @@ _GLOBAL(hash_page_patch_C) 1: addis r4,r7,next_slot@ha /* get next evict slot */ lwz r6,next_slot@l(r4) - addi r6,r6,PTE_SIZE /* search for candidate */ - andi. r6,r6,7*PTE_SIZE + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. 
r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) add r4,r3,r6 - LDPTE r0,PTE_SIZE/2(r4) /* get PTE second word */ + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ clrrwi r0,r0,12 lis r6,etext@h ori r6,r6,etext@l /* get etext */ @@ -426,7 +473,7 @@ _GLOBAL(hash_page_patch_C) found_empty: STPTE r5,0(r4) found_slot: - STPTE r8,PTE_SIZE/2(r4) + STPTE r8,HPTE_SIZE/2(r4) #else /* CONFIG_SMP */ /* @@ -452,7 +499,7 @@ found_slot: STPTE r5,0(r4) sync TLBSYNC - STPTE r8,PTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ sync SET_V(r5) STPTE r5,0(r4) /* finally set V bit in PTE */ @@ -499,14 +546,18 @@ _GLOBAL(flush_hash_pages) isync /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT rlwimi r5,r4,22,20,29 -1: lwz r0,0(r5) +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) cmpwi cr1,r6,1 andi. r0,r0,_PAGE_HASHPTE bne 2f ble cr1,19f addi r4,r4,0x1000 - addi r5,r5,4 + addi r5,r5,PTE_SIZE addi r6,r6,-1 b 1b @@ -545,7 +596,10 @@ _GLOBAL(flush_hash_pages) * already clear, we're done (for this pte). If not, * clear it (atomically) and proceed. -- paulus. */ -33: lwarx r8,0,r5 /* fetch the pte */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ andi. r0,r8,_PAGE_HASHPTE beq 8f /* done if HASHPTE is already clear */ rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ @@ -562,8 +616,8 @@ _GLOBAL(flush_hash_patch_A) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ li r0,8 /* PTEs/group */ mtctr r0 - addi r12,r8,-PTE_SIZE -1: LDPTEu r0,PTE_SIZE(r12) /* get next PTE */ + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ CMPPTE 0,r0,r11 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ 3f @@ -574,9 +628,9 @@ _GLOBAL(flush_hash_patch_A) _GLOBAL(flush_hash_patch_B) xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ xori r12,r12,(-PTEG_SIZE & 0xffff) - addi r12,r12,-PTE_SIZE + addi r12,r12,-HPTE_SIZE mtctr r0 -2: LDPTEu r0,PTE_SIZE(r12) +2: LDPTEu r0,HPTE_SIZE(r12) CMPPTE 0,r0,r11 bdnzf 2,2b xori r11,r11,PTE_H /* clear H again */ @@ -590,7 +644,7 @@ _GLOBAL(flush_hash_patch_B) 8: ble cr1,9f /* if all ptes checked */ 81: addi r6,r6,-1 - addi r5,r5,4 /* advance to next pte */ + addi r5,r5,PTE_SIZE addi r4,r4,0x1000 lwz r0,0(r5) /* check next pte */ cmpwi cr1,r6,1 diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8d3b58ebd38e..8d5b4758c13a 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -68,6 +68,7 @@ #define KB (1024) #define MB (1024*KB) +#define GB (1024L*MB) /* * Note: pte --> Linux PTE @@ -102,7 +103,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; #ifdef CONFIG_HUGETLB_PAGE -int mmu_huge_psize = MMU_PAGE_16M; unsigned int HPAGE_SHIFT; #endif #ifdef CONFIG_PPC_64K_PAGES @@ -151,39 +151,58 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { }, }; +static unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = pteflags & 0x1fa; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + + /* PP bits. 
PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) && + (pteflags & _PAGE_DIRTY))) + rflags |= 1; + + /* Always add C */ + return rflags | HPTE_R_C; +} int htab_bolt_mapping(unsigned long vstart, unsigned long vend, - unsigned long pstart, unsigned long mode, + unsigned long pstart, unsigned long prot, int psize, int ssize) { unsigned long vaddr, paddr; unsigned int step, shift; - unsigned long tmp_mode; int ret = 0; shift = mmu_psize_defs[psize].shift; step = 1 << shift; + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + for (vaddr = vstart, paddr = pstart; vaddr < vend; vaddr += step, paddr += step) { unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, ssize); unsigned long va = hpt_va(vaddr, vsid, ssize); + unsigned long tprot = prot; - tmp_mode = mode; - - /* Make non-kernel text non-executable */ - if (!in_kernel_text(vaddr)) - tmp_mode = mode | HPTE_R_N; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; hash = hpt_hash(va, shift, ssize); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - DBG("htab_bolt_mapping: calling %p\n", ppc_md.hpte_insert); - BUG_ON(!ppc_md.hpte_insert); - ret = ppc_md.hpte_insert(hpteg, va, paddr, - tmp_mode, HPTE_V_BOLTED, psize, ssize); + ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot, + HPTE_V_BOLTED, psize, ssize); if (ret < 0) break; @@ -329,6 +348,48 @@ static int __init htab_dt_scan_page_sizes(unsigned long node, return 0; } +#ifdef CONFIG_HUGETLB_PAGE +/* Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + unsigned long *addr_prop; + u32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* This property is the log base 2 of the number of virtual pages that + * will represent this memory block. */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << page_count_prop[0]); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = addr_prop[0]; + block_size = addr_prop[1]; + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + (16 * GB) <= lmb_end_of_DRAM()) { + lmb_reserve(phys_addr, block_size * expected_pages); + add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + static void __init htab_init_page_sizes(void) { int rc; @@ -418,15 +479,18 @@ static void __init htab_init_page_sizes(void) ); #ifdef CONFIG_HUGETLB_PAGE - /* Init large page size. Currently, we pick 16M or 1M depending + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + +/* Set default large page size. 
Currently, we pick 16M or 1M depending * on what is available */ if (mmu_psize_defs[MMU_PAGE_16M].shift) - set_huge_psize(MMU_PAGE_16M); + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; /* With 4k/4level pagetables, we can't (for now) cope with a * huge page size < PMD_SIZE */ else if (mmu_psize_defs[MMU_PAGE_1M].shift) - set_huge_psize(MMU_PAGE_1M); + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; #endif /* CONFIG_HUGETLB_PAGE */ } @@ -478,9 +542,9 @@ static unsigned long __init htab_get_table_size(void) #ifdef CONFIG_MEMORY_HOTPLUG void create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX, - mmu_linear_psize, mmu_kernel_ssize)); + BUG_ON(htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize)); } int remove_section_mapping(unsigned long start, unsigned long end) @@ -529,7 +593,7 @@ void __init htab_initialize(void) { unsigned long table; unsigned long pteg_count; - unsigned long mode_rw; + unsigned long prot; unsigned long base = 0, size = 0, limit; int i; @@ -587,7 +651,7 @@ void __init htab_initialize(void) mtspr(SPRN_SDR1, _SDR1); } - mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; + prot = pgprot_val(PAGE_KERNEL); #ifdef CONFIG_DEBUG_PAGEALLOC linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT; @@ -606,7 +670,8 @@ void __init htab_initialize(void) base = (unsigned long)__va(lmb.memory.region[i].base); size = lmb.memory.region[i].size; - DBG("creating mapping for region: %lx : %lx\n", base, size); + DBG("creating mapping for region: %lx..%lx (prot: %x)\n", + base, size, prot); #ifdef CONFIG_U3_DART /* Do not map the DART space. Fortunately, it will be aligned @@ -623,21 +688,21 @@ void __init htab_initialize(void) unsigned long dart_table_end = dart_tablebase + 16 * MB; if (base != dart_tablebase) BUG_ON(htab_bolt_mapping(base, dart_tablebase, - __pa(base), mode_rw, + __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); if ((base + size) > dart_table_end) BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, base + size, __pa(dart_table_end), - mode_rw, + prot, mmu_linear_psize, mmu_kernel_ssize)); continue; } #endif /* CONFIG_U3_DART */ BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), - mode_rw, mmu_linear_psize, mmu_kernel_ssize)); + prot, mmu_linear_psize, mmu_kernel_ssize)); } /* @@ -655,7 +720,7 @@ void __init htab_initialize(void) tce_alloc_start = base + size + 1; BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, - __pa(tce_alloc_start), mode_rw, + __pa(tce_alloc_start), prot, mmu_linear_psize, mmu_kernel_ssize)); } @@ -847,7 +912,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) #ifdef CONFIG_HUGETLB_PAGE /* Handle hugepage regions */ - if (HPAGE_SHIFT && psize == mmu_huge_psize) { + if (HPAGE_SHIFT && mmu_huge_psizes[psize]) { DBG_LOW(" -> huge page !\n"); return hash_huge_page(mm, access, ea, vsid, local, trap); } @@ -1076,8 +1141,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long hash, hpteg; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = _PAGE_ACCESSED | _PAGE_DIRTY | - _PAGE_COHERENT | PP_RWXX | HPTE_R_N; + unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); int ret; hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c 
index 0d12fba31bc5..a117024ab8cd 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -24,21 +24,43 @@ #include <asm/cputable.h> #include <asm/spu.h> -#define HPAGE_SHIFT_64K 16 -#define HPAGE_SHIFT_16M 24 +#define PAGE_SHIFT_64K 16 +#define PAGE_SHIFT_16M 24 +#define PAGE_SHIFT_16G 34 #define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) +#define MAX_NUMBER_GPAGES 1024 -unsigned int hugepte_shift; -#define PTRS_PER_HUGEPTE (1 << hugepte_shift) -#define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << hugepte_shift) +/* Tracks the 16G pages after the device tree is scanned and before the + * huge_boot_pages list is ready. */ +static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; +static unsigned nr_gpages; -#define HUGEPD_SHIFT (HPAGE_SHIFT + hugepte_shift) -#define HUGEPD_SIZE (1UL << HUGEPD_SHIFT) -#define HUGEPD_MASK (~(HUGEPD_SIZE-1)) +/* Array of valid huge page sizes - non-zero value(hugepte_shift) is + * stored for the huge page sizes that are valid. + */ +unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ + +#define hugepte_shift mmu_huge_psizes +#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize]) +#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize]) + +#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ + + hugepte_shift[psize]) +#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) +#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) -#define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM]) +/* Subtract one from array size because we don't need a cache for 4K since + * is not a huge page size */ +#define huge_pgtable_cache(psize) (pgtable_cache[HUGEPTE_CACHE_NUM \ + + psize-1]) +#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) + +static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { + "unused_4K", "hugepte_cache_64K", "unused_64K_AP", + "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" +}; /* Flag to mark huge PD pointers. 
This means pmd_bad() and pud_bad() * will choke on pointers to hugepte tables, which is handy for @@ -49,24 +71,49 @@ typedef struct { unsigned long pd; } hugepd_t; #define hugepd_none(hpd) ((hpd).pd == 0) +static inline int shift_to_mmu_psize(unsigned int shift) +{ + switch (shift) { +#ifndef CONFIG_PPC_64K_PAGES + case PAGE_SHIFT_64K: + return MMU_PAGE_64K; +#endif + case PAGE_SHIFT_16M: + return MMU_PAGE_16M; + case PAGE_SHIFT_16G: + return MMU_PAGE_16G; + } + return -1; +} + +static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) +{ + if (mmu_psize_defs[mmu_psize].shift) + return mmu_psize_defs[mmu_psize].shift; + BUG(); +} + static inline pte_t *hugepd_page(hugepd_t hpd) { BUG_ON(!(hpd.pd & HUGEPD_OK)); return (pte_t *)(hpd.pd & ~HUGEPD_OK); } -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr) +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, + struct hstate *hstate) { - unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1)); + unsigned int shift = huge_page_shift(hstate); + int psize = shift_to_mmu_psize(shift); + unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); pte_t *dir = hugepd_page(*hpdp); return dir + idx; } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address) + unsigned long address, unsigned int psize) { - pte_t *new = kmem_cache_alloc(huge_pgtable_cache, + pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize), GFP_KERNEL|__GFP_REPEAT); if (! new) @@ -74,35 +121,76 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, spin_lock(&mm->page_table_lock); if (!hugepd_none(*hpdp)) - kmem_cache_free(huge_pgtable_cache, new); + kmem_cache_free(huge_pgtable_cache(psize), new); else hpdp->pd = (unsigned long)new | HUGEPD_OK; spin_unlock(&mm->page_table_lock); return 0; } -/* Base page size affects how we walk hugetlb page tables */ -#ifdef CONFIG_PPC_64K_PAGES -#define hpmd_offset(pud, addr) pmd_offset(pud, addr) -#define hpmd_alloc(mm, pud, addr) pmd_alloc(mm, pud, addr) -#else -static inline -pmd_t *hpmd_offset(pud_t *pud, unsigned long addr) + +static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PUD_SHIFT) + return pud_offset(pgd, addr); + else + return (pud_t *) pgd; +} +static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, + struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PUD_SHIFT) + return pud_alloc(mm, pgd, addr); + else + return (pud_t *) pgd; +} +static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) { - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) + if (huge_page_shift(hstate) < PMD_SHIFT) return pmd_offset(pud, addr); else return (pmd_t *) pud; } -static inline -pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr) +static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, + struct hstate *hstate) { - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) + if (huge_page_shift(hstate) < PMD_SHIFT) return pmd_alloc(mm, pud, addr); else return (pmd_t *) pud; } -#endif + +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. 
+ */ +void add_gpage(unsigned long addr, unsigned long page_size, + unsigned long number_of_pages) +{ + if (!addr) + return; + while (number_of_pages > 0) { + gpage_freearray[nr_gpages] = addr; + nr_gpages++; + number_of_pages--; + addr += page_size; + } +} + +/* Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + if (nr_gpages == 0) + return 0; + m = phys_to_virt(gpage_freearray[--nr_gpages]); + gpage_freearray[nr_gpages] = 0; + list_add(&m->list, &huge_boot_pages); + m->hstate = hstate; + return 1; +} + /* Modelled after find_linux_pte() */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -111,39 +199,52 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pud_t *pu; pmd_t *pm; - BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize); + unsigned int psize; + unsigned int shift; + unsigned long sz; + struct hstate *hstate; + psize = get_slice_psize(mm, addr); + shift = mmu_psize_to_shift(psize); + sz = ((1UL) << shift); + hstate = size_to_hstate(sz); - addr &= HPAGE_MASK; + addr &= hstate->mask; pg = pgd_offset(mm, addr); if (!pgd_none(*pg)) { - pu = pud_offset(pg, addr); + pu = hpud_offset(pg, addr, hstate); if (!pud_none(*pu)) { - pm = hpmd_offset(pu, addr); + pm = hpmd_offset(pu, addr, hstate); if (!pmd_none(*pm)) - return hugepte_offset((hugepd_t *)pm, addr); + return hugepte_offset((hugepd_t *)pm, addr, + hstate); } } return NULL; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pg; pud_t *pu; pmd_t *pm; hugepd_t *hpdp = NULL; + struct hstate *hstate; + unsigned int psize; + hstate = size_to_hstate(sz); - BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize); + psize = get_slice_psize(mm, addr); + BUG_ON(!mmu_huge_psizes[psize]); - addr &= HPAGE_MASK; + addr &= hstate->mask; pg = pgd_offset(mm, addr); - pu = pud_alloc(mm, pg, addr); + pu = hpud_alloc(mm, pg, addr, hstate); if (pu) { - pm = hpmd_alloc(mm, pu, addr); + pm = hpmd_alloc(mm, pu, addr, hstate); if (pm) hpdp = (hugepd_t *)pm; } @@ -151,10 +252,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) if (! 
hpdp) return NULL; - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr)) + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize)) return NULL; - return hugepte_offset(hpdp, addr); + return hugepte_offset(hpdp, addr, hstate); } int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) @@ -162,19 +263,22 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 0; } -static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp) +static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, + unsigned int psize) { pte_t *hugepte = hugepd_page(*hpdp); hpdp->pd = 0; tlb->need_flush = 1; - pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM, + pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, + HUGEPTE_CACHE_NUM+psize-1, PGF_CACHENUM_MASK)); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) + unsigned long floor, unsigned long ceiling, + unsigned int psize) { pmd_t *pmd; unsigned long next; @@ -186,7 +290,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) continue; - free_hugepte_range(tlb, (hugepd_t *)pmd); + free_hugepte_range(tlb, (hugepd_t *)pmd, psize); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -212,26 +316,24 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud_t *pud; unsigned long next; unsigned long start; + unsigned int shift; + unsigned int psize = get_slice_psize(tlb->mm, addr); + shift = mmu_psize_to_shift(psize); start = addr; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); -#ifdef CONFIG_PPC_64K_PAGES - if (pud_none_or_clear_bad(pud)) - continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling); -#else - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) { + if (shift < PMD_SHIFT) { if (pud_none_or_clear_bad(pud)) continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling); + hugetlb_free_pmd_range(tlb, pud, addr, next, floor, + ceiling, psize); } else { if (pud_none(*pud)) continue; - free_hugepte_range(tlb, (hugepd_t *)pud); + free_hugepte_range(tlb, (hugepd_t *)pud, psize); } -#endif } while (pud++, addr = next, addr != end); start &= PGDIR_MASK; @@ -255,7 +357,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void hugetlb_free_pgd_range(struct mmu_gather **tlb, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -297,31 +399,39 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. 
*/ + unsigned int psize = get_slice_psize(tlb->mm, addr); - addr &= HUGEPD_MASK; + addr &= HUGEPD_MASK(psize); if (addr < floor) { - addr += HUGEPD_SIZE; + addr += HUGEPD_SIZE(psize); if (!addr) return; } if (ceiling) { - ceiling &= HUGEPD_MASK; + ceiling &= HUGEPD_MASK(psize); if (!ceiling) return; } if (end - 1 > ceiling - 1) - end -= HUGEPD_SIZE; + end -= HUGEPD_SIZE(psize); if (addr > end - 1) return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { - BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize); + psize = get_slice_psize(tlb->mm, addr); + BUG_ON(!mmu_huge_psizes[psize]); next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + if (mmu_psize_to_shift(psize) < PUD_SHIFT) { + if (pgd_none_or_clear_bad(pgd)) + continue; + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } else { + if (pgd_none(*pgd)) + continue; + free_hugepte_range(tlb, (hugepd_t *)pgd, psize); + } } while (pgd++, addr = next, addr != end); } @@ -334,7 +444,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, * necessary anymore if we make hpte_need_flush() get the * page size from the slices */ - pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1); + unsigned int psize = get_slice_psize(mm, addr); + unsigned int shift = mmu_psize_to_shift(psize); + unsigned long sz = ((1UL) << shift); + struct hstate *hstate = size_to_hstate(sz); + pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1); } *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); } @@ -351,14 +465,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; + unsigned int mmu_psize = get_slice_psize(mm, address); - if (get_slice_psize(mm, address) != mmu_huge_psize) + /* Verify it is a huge page else bail. 
*/ + if (!mmu_huge_psizes[mmu_psize]) return ERR_PTR(-EINVAL); ptep = huge_pte_offset(mm, address); page = pte_page(*ptep); - if (page) - page += (address % HPAGE_SIZE) / PAGE_SIZE; + if (page) { + unsigned int shift = mmu_psize_to_shift(mmu_psize); + unsigned long sz = ((1UL) << shift); + page += (address % sz) / PAGE_SIZE; + } return page; } @@ -368,6 +487,11 @@ int pmd_huge(pmd_t pmd) return 0; } +int pud_huge(pud_t pud) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -381,15 +505,16 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return slice_get_unmapped_area(addr, len, flags, - mmu_huge_psize, 1, 0); + struct hstate *hstate = hstate_file(file); + int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); } /* * Called by asm hashtable.S for doing lazy icache flush */ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, - pte_t pte, int trap) + pte_t pte, int trap, unsigned long sz) { struct page *page; int i; @@ -402,7 +527,7 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, /* page is dirty */ if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { if (trap == 0x400) { - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) + for (i = 0; i < (sz / PAGE_SIZE); i++) __flush_dcache_icache(page_address(page+i)); set_bit(PG_arch_1, &page->flags); } else { @@ -418,11 +543,16 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access, { pte_t *ptep; unsigned long old_pte, new_pte; - unsigned long va, rflags, pa; + unsigned long va, rflags, pa, sz; long slot; int err = 1; int ssize = user_segment_size(ea); + unsigned int mmu_psize; + int shift; + mmu_psize = get_slice_psize(mm, ea); + if (!mmu_huge_psizes[mmu_psize]) + goto out; ptep = huge_pte_offset(mm, ea); /* Search the Linux page table for a match with va */ @@ -465,30 +595,32 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access, rflags = 0x2 | (!(new_pte & _PAGE_RW)); /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ rflags |= ((new_pte & _PAGE_EXEC) ? 
0 : HPTE_R_N); + shift = mmu_psize_to_shift(mmu_psize); + sz = ((1UL) << shift); if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) /* No CPU has hugepages but lacks no execute, so we * don't need to worry about that case */ rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), - trap); + trap, sz); /* Check if pte already has an hpte (case 2) */ if (unlikely(old_pte & _PAGE_HASHPTE)) { /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; - hash = hpt_hash(va, HPAGE_SHIFT, ssize); + hash = hpt_hash(va, shift, ssize); if (old_pte & _PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += (old_pte & _PAGE_F_GIX) >> 12; - if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize, + if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, ssize, local) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } if (likely(!(old_pte & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize); + unsigned long hash = hpt_hash(va, shift, ssize); unsigned long hpte_group; pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; @@ -509,7 +641,7 @@ repeat: /* Insert into the hash table, primary slot */ slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, - mmu_huge_psize, ssize); + mmu_psize, ssize); /* Primary is full, try the secondary */ if (unlikely(slot == -1)) { @@ -517,7 +649,7 @@ repeat: HPTES_PER_GROUP) & ~0x7UL; slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, HPTE_V_SECONDARY, - mmu_huge_psize, ssize); + mmu_psize, ssize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * @@ -549,45 +681,54 @@ void set_huge_psize(int psize) { /* Check that it is a page size supported by the hardware and * that it fits within pagetable limits. */ - if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT && + if (mmu_psize_defs[psize].shift && + mmu_psize_defs[psize].shift < SID_SHIFT_1T && (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT || - mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) { - HPAGE_SHIFT = mmu_psize_defs[psize].shift; - mmu_huge_psize = psize; -#ifdef CONFIG_PPC_64K_PAGES - hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT); -#else - if (HPAGE_SHIFT == HPAGE_SHIFT_64K) - hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT); - else - hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT); -#endif - + mmu_psize_defs[psize].shift == PAGE_SHIFT_64K || + mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) { + /* Return if huge page size has already been setup or is the + * same as the base page size. 
*/ + if (mmu_huge_psizes[psize] || + mmu_psize_defs[psize].shift == PAGE_SHIFT) + return; + hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); + + switch (mmu_psize_defs[psize].shift) { + case PAGE_SHIFT_64K: + /* We only allow 64k hpages with 4k base page, + * which was checked above, and always put them + * at the PMD */ + hugepte_shift[psize] = PMD_SHIFT; + break; + case PAGE_SHIFT_16M: + /* 16M pages can be at two different levels + * of pagestables based on base page size */ + if (PAGE_SHIFT == PAGE_SHIFT_64K) + hugepte_shift[psize] = PMD_SHIFT; + else /* 4k base page */ + hugepte_shift[psize] = PUD_SHIFT; + break; + case PAGE_SHIFT_16G: + /* 16G pages are always at PGD level */ + hugepte_shift[psize] = PGDIR_SHIFT; + break; + } + hugepte_shift[psize] -= mmu_psize_defs[psize].shift; } else - HPAGE_SHIFT = 0; + hugepte_shift[psize] = 0; } static int __init hugepage_setup_sz(char *str) { unsigned long long size; - int mmu_psize = -1; + int mmu_psize; int shift; size = memparse(str, &str); shift = __ffs(size); - switch (shift) { -#ifndef CONFIG_PPC_64K_PAGES - case HPAGE_SHIFT_64K: - mmu_psize = MMU_PAGE_64K; - break; -#endif - case HPAGE_SHIFT_16M: - mmu_psize = MMU_PAGE_16M; - break; - } - - if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift) + mmu_psize = shift_to_mmu_psize(shift); + if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) set_huge_psize(mmu_psize); else printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); @@ -596,23 +737,40 @@ static int __init hugepage_setup_sz(char *str) } __setup("hugepagesz=", hugepage_setup_sz); -static void zero_ctor(struct kmem_cache *cache, void *addr) -{ - memset(addr, 0, kmem_cache_size(cache)); -} - static int __init hugetlbpage_init(void) { + unsigned int psize; + if (!cpu_has_feature(CPU_FTR_16M_PAGE)) return -ENODEV; - huge_pgtable_cache = kmem_cache_create("hugepte_cache", - HUGEPTE_TABLE_SIZE, - HUGEPTE_TABLE_SIZE, - 0, - zero_ctor); - if (! huge_pgtable_cache) - panic("hugetlbpage_init(): could not create hugepte cache\n"); + /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE + * and adjust PTE_NONCACHE_NUM if the number of supported huge page + * sizes changes. + */ + set_huge_psize(MMU_PAGE_16M); + set_huge_psize(MMU_PAGE_16G); + + /* Temporarily disable support for 64K huge pages when 64K SPU local + * store support is enabled as the current implementation conflicts. 
+ */ +#ifndef CONFIG_SPU_FS_64K_LS + set_huge_psize(MMU_PAGE_64K); +#endif + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + if (mmu_huge_psizes[psize]) { + huge_pgtable_cache(psize) = kmem_cache_create( + HUGEPTE_CACHE_NAME(psize), + HUGEPTE_TABLE_SIZE(psize), + HUGEPTE_TABLE_SIZE(psize), + 0, + NULL); + if (!huge_pgtable_cache(psize)) + panic("hugetlbpage_init(): could not create %s"\ + "\n", HUGEPTE_CACHE_NAME(psize)); + } + } return 0; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 6ef63caca682..3e6a6543f53a 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -136,9 +136,14 @@ static int __init setup_kcore(void) module_init(setup_kcore); #endif -static void zero_ctor(struct kmem_cache *cache, void *addr) +static void pgd_ctor(void *addr) { - memset(addr, 0, kmem_cache_size(cache)); + memset(addr, 0, PGD_TABLE_SIZE); +} + +static void pmd_ctor(void *addr) +{ + memset(addr, 0, PMD_TABLE_SIZE); } static const unsigned int pgtable_cache_size[2] = { @@ -153,29 +158,18 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { }; #ifdef CONFIG_HUGETLB_PAGE -/* Hugepages need one extra cache, initialized in hugetlbpage.c. We - * can't put into the tables above, because HPAGE_SHIFT is not compile - * time constant. */ -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1]; +/* Hugepages need an extra cache per hugepagesize, initialized in + * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT + * is not compile time constant. */ +struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; #else struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; #endif void pgtable_cache_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { - int size = pgtable_cache_size[i]; - const char *name = pgtable_cache_name[i]; - - pr_debug("Allocating page table cache %s (#%d) " - "for size: %08x...\n", name, i, size); - pgtable_cache[i] = kmem_cache_create(name, - size, size, - SLAB_PANIC, - zero_ctor); - } + pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); + pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); } #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -212,13 +206,10 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) int __meminit vmemmap_populate(struct page *start_page, unsigned long nr_pages, int node) { - unsigned long mode_rw; unsigned long start = (unsigned long)start_page; unsigned long end = (unsigned long)(start_page + nr_pages); unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; - mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; - /* Align to the page size of the linear mapping. 
*/ start = _ALIGN_DOWN(start, page_size); @@ -236,9 +227,9 @@ int __meminit vmemmap_populate(struct page *start_page, pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", start, p, __pa(p)); - mapped = htab_bolt_mapping(start, start + page_size, - __pa(p), mode_rw, mmu_vmemmap_psize, - mmu_kernel_ssize); + mapped = htab_bolt_mapping(start, start + page_size, __pa(p), + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); BUG_ON(mapped < 0); } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 1ca2235f0965..b9e1a1da6e52 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -75,11 +75,10 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr) int page_is_ram(unsigned long pfn) { - unsigned long paddr = (pfn << PAGE_SHIFT); - #ifndef CONFIG_PPC64 /* XXX for now */ - return paddr < __pa(high_memory); + return pfn < max_pfn; #else + unsigned long paddr = (pfn << PAGE_SHIFT); int i; for (i=0; i < lmb.memory.cnt; i++) { unsigned long base; @@ -135,23 +134,6 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(zone, start_pfn, nr_pages); } - -#ifdef CONFIG_MEMORY_HOTREMOVE -int remove_memory(u64 start, u64 size) -{ - unsigned long start_pfn, end_pfn; - int ret; - - start_pfn = start >> PAGE_SHIFT; - end_pfn = start_pfn + (size >> PAGE_SHIFT); - ret = offline_pages(start_pfn, end_pfn, 120 * HZ); - if (ret) - goto out; - /* Arch-specific calls go here - next patch */ -out: - return ret; -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ /* @@ -186,45 +168,6 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, } EXPORT_SYMBOL_GPL(walk_memory_resource); -void show_mem(void) -{ - unsigned long total = 0, reserved = 0; - unsigned long shared = 0, cached = 0; - unsigned long highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - - printk("Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - unsigned long flags; - pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; i++) { - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - page = pgdat_page_nr(pgdat, i); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - pgdat_resize_unlock(pgdat, &flags); - } - printk("%ld pages of RAM\n", total); -#ifdef CONFIG_HIGHMEM - printk("%ld pages of HIGHMEM\n", highmem); -#endif - printk("%ld reserved pages\n", reserved); - printk("%ld pages shared\n", shared); - printk("%ld pages swap cached\n", cached); -} - /* * Initialize the bootmem system and give it all the memory we * have available. 
If we are using highmem, we only put the @@ -350,7 +293,7 @@ void __init paging_init(void) #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%lx\n", - (u64)top_of_ram, total_ram); + (unsigned long long)top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", (long int)((top_of_ram - total_ram) >> 20)); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index cf4bffba6f7c..eb505ad34a85 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -39,7 +39,6 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(numa_cpumask_lookup_table); EXPORT_SYMBOL(node_data); -static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; static int n_mem_addr_cells, n_mem_size_cells; @@ -90,6 +89,48 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, return 0; } +/* + * get_active_region_work_fn - A helper function for get_node_active_region + * Returns datax set to the start_pfn and end_pfn if they contain + * the initial value of datax->start_pfn between them + * @start_pfn: start page(inclusive) of region to check + * @end_pfn: end page(exclusive) of region to check + * @datax: comes in with ->start_pfn set to value to search for and + * goes out with active range if it contains it + * Returns 1 if search value is in range else 0 + */ +static int __init get_active_region_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct node_active_region *data; + data = (struct node_active_region *)datax; + + if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) { + data->start_pfn = start_pfn; + data->end_pfn = end_pfn; + return 1; + } + return 0; + +} + +/* + * get_node_active_region - Return active region containing start_pfn + * Active range returned is empty if none found. + * @start_pfn: The page to return the region for. + * @node_ar: Returned set to the active region containing start_pfn + */ +static void __init get_node_active_region(unsigned long start_pfn, + struct node_active_region *node_ar) +{ + int nid = early_pfn_to_nid(start_pfn); + + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + node_ar->end_pfn = start_pfn; + work_with_active_regions(nid, get_active_region_work_fn, node_ar); +} + static void __cpuinit map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -151,6 +192,21 @@ static const int *of_get_associativity(struct device_node *dev) return of_get_property(dev, "ibm,associativity", NULL); } +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const u32 *of_get_usable_memory(struct device_node *memory) +{ + const u32 *prop; + u32 len; + prop = of_get_property(memory, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + return prop; +} + /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ @@ -472,12 +528,10 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, /* * We use lmb_end_of_DRAM() in here instead of memory_limit because * we've already adjusted it for the limit and it takes care of - * having memory holes below the limit. + * having memory holes below the limit. Also, in the case of + * iommu_is_off, memory_limit is not set but is implicitly enforced. */ - if (! 
memory_limit) - return size; - if (start + size <= lmb_end_of_DRAM()) return size; @@ -488,14 +542,29 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, } /* + * Reads the counter for a given entry in + * linux,drconf-usable-memory property + */ +static inline int __init read_usm_ranges(const u32 **usm) +{ + /* + * For each lmb in ibm,dynamic-memory a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter followed by that many (base, size) duple. + * read the counter from linux,drconf-usable-memory + */ + return read_n_cells(n_mem_size_cells, usm); +} + +/* * Extract NUMA information from the ibm,dynamic-reconfiguration-memory * node. This assumes n_mem_{addr,size}_cells have been set. */ static void __init parse_drconf_memory(struct device_node *memory) { - const u32 *dm; - unsigned int n, rc; - unsigned long lmb_size, size; + const u32 *dm, *usm; + unsigned int n, rc, ranges, is_kexec_kdump = 0; + unsigned long lmb_size, base, size, sz; int nid; struct assoc_arrays aa; @@ -511,6 +580,11 @@ static void __init parse_drconf_memory(struct device_node *memory) if (rc) return; + /* check if this is a kexec/kdump kernel */ + usm = of_get_usable_memory(memory); + if (usm != NULL) + is_kexec_kdump = 1; + for (; n != 0; --n) { struct of_drconf_cell drmem; @@ -522,21 +596,31 @@ static void __init parse_drconf_memory(struct device_node *memory) || !(drmem.flags & DRCONF_MEM_ASSIGNED)) continue; - nid = of_drconf_to_nid_single(&drmem, &aa); + base = drmem.base_addr; + size = lmb_size; + ranges = 1; - fake_numa_create_new_node( - ((drmem.base_addr + lmb_size) >> PAGE_SHIFT), + if (is_kexec_kdump) { + ranges = read_usm_ranges(&usm); + if (!ranges) /* there are no (base, size) duple */ + continue; + } + do { + if (is_kexec_kdump) { + base = read_n_cells(n_mem_addr_cells, &usm); + size = read_n_cells(n_mem_size_cells, &usm); + } + nid = of_drconf_to_nid_single(&drmem, &aa); + fake_numa_create_new_node( + ((base + size) >> PAGE_SHIFT), &nid); - - node_set_online(nid); - - size = numa_enforce_memory_limit(drmem.base_addr, lmb_size); - if (!size) - continue; - - add_active_range(nid, drmem.base_addr >> PAGE_SHIFT, - (drmem.base_addr >> PAGE_SHIFT) - + (size >> PAGE_SHIFT)); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + add_active_range(nid, base >> PAGE_SHIFT, + (base >> PAGE_SHIFT) + + (sz >> PAGE_SHIFT)); + } while (--ranges); } } @@ -816,7 +900,7 @@ void __init do_init_bootmem(void) dbg("node %d\n", nid); dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); - NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; + NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; @@ -838,38 +922,53 @@ void __init do_init_bootmem(void) start_pfn, end_pfn); free_bootmem_with_active_regions(nid, end_pfn); + } - /* Mark reserved regions on this node */ - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].base; - unsigned long size = lmb.reserved.region[i].size; - unsigned long start_paddr = start_pfn << PAGE_SHIFT; - unsigned long end_paddr = end_pfn << PAGE_SHIFT; - - if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && - early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) - continue; - - if (physbase < end_paddr && - (physbase+size) > start_paddr) { - /* overlaps */ - if (physbase < start_paddr) { - size -= start_paddr - physbase; - physbase = start_paddr; - } - - if (size > end_paddr - physbase) - 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3ef0ad2f9ca0..365e61ae5dbc 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -107,9 +107,18 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 {
 unsigned long i;
+ /* Make sure we have the base flags */
 if ((flags & _PAGE_PRESENT) == 0)
 flags |= pgprot_val(PAGE_KERNEL);
+ /* Non-cacheable page cannot be coherent */
+ if (flags & _PAGE_NO_CACHE)
+ flags &= ~_PAGE_COHERENT;
+
+ /* We don't support the 4K PFN hack with ioremap */
+ if (flags & _PAGE_4K_PFN)
+ return NULL;
+
 WARN_ON(pa & ~PAGE_MASK);
 WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 WARN_ON(size & ~PAGE_MASK);
@@ -190,6 +199,13 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
 unsigned long flags)
 {
+ /* writeable implies dirty for kernel addresses */
+ if (flags & _PAGE_RW)
+ flags |= _PAGE_DIRTY;
+
+ /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+ flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
 if (ppc_md.ioremap)
 return ppc_md.ioremap(addr, size, flags);
 return __ioremap(addr, size, flags);
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index c53145f61942..6aa120813775 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -236,8 +236,8 @@ void __init MMU_init_hw(void)
 Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
- printk("Total memory = %ldMB; using %ldkB for hash table (at %p)\n",
- total_memory >> 20, Hash_size >> 10, Hash);
+ printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
+ (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
 /*
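The ppc_mmu_32.c hunk, like the earlier mem.c one, switches the printk format to a 64-bit-safe pattern. The small userspace program below illustrates the idiom only; my_phys_t is a stand-in for a configuration-dependent type such as phys_addr_t and is not a real kernel type.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for a type such as phys_addr_t, which may be 32 or 64 bits
 * wide depending on the kernel configuration. */
typedef uint64_t my_phys_t;

int main(void)
{
	my_phys_t total_memory = 3ULL << 30;	/* 3GB, for example */

	/*
	 * Printing through %ld breaks when the type is wider than long on a
	 * 32-bit build; casting to unsigned long long and using %llu/%llx
	 * is correct for every width, which is the pattern the patch adopts.
	 */
	printf("Total memory = %lluMB\n",
	       (unsigned long long)(total_memory >> 20));
	printf("Top of RAM: 0x%llx\n", (unsigned long long)total_memory);
	return 0;
}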
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c
index eb4b512d65fa..f9a47fee3927 100644
--- a/arch/powerpc/mm/tlb_32.c
+++ b/arch/powerpc/mm/tlb_32.c
@@ -45,6 +45,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
 flush_hash_pages(mm->context.id, addr, ptephys, 1);
 }
 }
+EXPORT_SYMBOL(flush_hash_entry);
 /*
 * Called by ptep_set_access_flags, must flush on CPUs for which the
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index a01b5c608ff9..be7dd422c0fa 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
 /* This is declared as we are using the more or less generic
- * include/asm-powerpc/tlb.h file -- tgall
+ * arch/powerpc/include/asm/tlb.h file -- tgall
 */
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
@@ -147,7 +147,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 */
 if (huge) {
 #ifdef CONFIG_HUGETLB_PAGE
- psize = mmu_huge_psize;
+ psize = get_slice_psize(mm, addr);
 #else
 BUG();
 psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
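The last hunk stops assuming one global huge-page size and instead asks the slice map which page size backs the address being flushed. The toy program below models that idea only: the slice geometry, lookup table and psize constants are assumptions invented for the example and do not reflect the kernel's actual slice implementation.

#include <stdio.h>

/*
 * Toy model of a per-address page-size lookup. The real kernel keeps a
 * per-mm slice map and get_slice_psize(mm, addr) consults it; everything
 * named toy_* here is made up for illustration.
 */
#define TOY_SLICE_SHIFT	28			/* 256MB slices (assumption) */
#define TOY_NUM_SLICES	16
#define TOY_PSIZE_4K	0
#define TOY_PSIZE_16M	1

static int toy_slice_psize[TOY_NUM_SLICES];	/* one page-size index per slice */

static int toy_get_slice_psize(unsigned long addr)
{
	return toy_slice_psize[(addr >> TOY_SLICE_SHIFT) % TOY_NUM_SLICES];
}

int main(void)
{
	unsigned long huge_addr = 0x30000000UL;		/* falls in slice 3 */
	unsigned long small_addr = 0x10000000UL;	/* falls in slice 1 */

	/* pretend slice 3 was set up for 16MB huge pages */
	toy_slice_psize[3] = TOY_PSIZE_16M;

	printf("psize at %#lx: %d\n", huge_addr, toy_get_slice_psize(huge_addr));
	printf("psize at %#lx: %d\n", small_addr, toy_get_slice_psize(small_addr));
	return 0;
}

Looking the size up per address is what allows different huge-page sizes to coexist in one address space, which a single mmu_huge_psize value could not express.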