From 50c6a665b383cb5839e45d04e36faeeefaffa052 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 10 Apr 2015 19:37:34 -0500 Subject: powerpc/hugetlb: Call mm_dec_nr_pmds() in hugetlb_free_pmd_range() Commit dc6c9a35b66b5 ("mm: account pmd page tables to the process") added a counter that is incremented whenever a PMD is allocated and decremented whenever a PMD is freed. For hugepages on PPC, common code is used to allocated PMDs, but arch-specific code is used to free PMDs. This results in kernel output such as "BUG: non-zero nr_pmds on freeing mm: 1" when using hugepages. Update the PPC hugepage PMD freeing code to decrement the count, just as the above commit did for free_pmd_range(). Fixes: dc6c9a35b66b5 ("mm: account pmd page tables to the process") Signed-off-by: Scott Wood Reviewed-by: Aneesh Kumar K.V Cc: stable@vger.kernel.org # 4.0.x --- arch/powerpc/mm/hugetlbpage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7e408bfc7948..cecbe00cee24 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -581,6 +581,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, -- cgit v1.2.1 From 691e95fd7396905a38d98919e9c150dbc3ea21a3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 30 Mar 2015 10:41:03 +0530 Subject: powerpc/mm/thp: Make page table walk safe against thp split/collapse We can disable a THP split or a hugepage collapse by disabling irq. We do send IPI to all the cpus in the early part of split/collapse, and disabling local irq ensure we don't make progress with split/collapse. If the THP is getting split we return NULL from find_linux_pte_or_hugepte(). For all the current callers it should be ok. We need to be careful if we want to use returned pte_t pointer outside the irq disabled region. W.r.t to THP split, the pfn remains the same, but then a hugepage collapse will result in a pfn change. There are few steps we can take to avoid a hugepage collapse.One way is to take page reference inside the irq disable region. Other option is to take mmap_sem so that a parallel collapse will not happen. We can also disable collapse by taking pmd_lock. Another method used by kvm subsystem is to check whether we had a mmu_notifer update in between using mmu_notifier_retry(). Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 2cb278a2f658..a9dbb27ca887 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -109,7 +109,7 @@ int pgd_huge(pgd_t pgd) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { /* Only called for hugetlbfs pages, hence can ignore THP */ - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -682,28 +682,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } +/* + * We are holding mmap_sem, so a parallel huge page collapse cannot run. + * To prevent hugepage split, disable irq. + */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; unsigned shift; - unsigned long mask; + unsigned long mask, flags; /* * Transparent hugepages are handled by generic code. We can skip them * here. */ + local_irq_save(flags); ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. */ - if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) { + local_irq_restore(flags); return ERR_PTR(-EINVAL); - + } mask = (1UL << shift) - 1; page = pte_page(*ptep); if (page) page += (address & mask) / PAGE_SIZE; + local_irq_restore(flags); return page; } @@ -950,9 +957,12 @@ void flush_dcache_icache_hugepage(struct page *page) * * So long as we atomically load page table pointers we are safe against teardown, * we can follow the address down to the the page and take a ref on it. + * This function need to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 */ -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -1031,7 +1041,7 @@ out: *shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) -- cgit v1.2.1 From 7d6e7f7ffaba4e013c7a0589140431799bc17985 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 30 Mar 2015 10:41:04 +0530 Subject: powerpc/mm/thp: Return pte address if we find trans_splitting. For THP that is marked trans splitting, we return the pte. This require the callers to handle the pmd_trans_splitting scenario, if they care. All the current callers are either looking at pfn or write_ok, hence we don't need to update them. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a9dbb27ca887..0ce968b00b7c 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1014,12 +1014,11 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, * A hugepage collapse is captured by pmd_none, because * it mark the pmd none and do a hpte invalidate. * - * A hugepage split is captured by pmd_trans_splitting - * because we mark the pmd trans splitting and do a - * hpte invalidate - * + * We don't worry about pmd_trans_splitting here, The + * caller if it needs to handle the splitting case + * should check for that. */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return NULL; if (pmd_huge(pmd) || pmd_large(pmd)) { -- cgit v1.2.1