summaryrefslogtreecommitdiffstats
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c118
1 files changed, 50 insertions, 68 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86fe697e8bfb..87ab9b8f56b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -842,20 +842,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
+ pmd_t *pmd, int flags)
{
pmd_t _pmd;
- /*
- * We should set the dirty bit only for FOLL_WRITE but for now
- * the dirty bit in the pmd is meaningless. And if the dirty
- * bit will become meaningful and we'll only set it with
- * FOLL_WRITE, an atomic set_bit will be required on the pmd to
- * set the young bit, instead of the current set_pmd_at.
- */
- _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ _pmd = pmd_mkyoung(*pmd);
+ if (flags & FOLL_WRITE)
+ _pmd = pmd_mkdirty(_pmd);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, 1))
+ pmd, _pmd, flags & FOLL_WRITE))
update_mmu_cache_pmd(vma, addr, pmd);
}
@@ -884,7 +879,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd);
+ touch_pmd(vma, addr, pmd, flags);
/*
* device mapped pages can only be returned if the
@@ -995,20 +990,15 @@ out:
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud)
+ pud_t *pud, int flags)
{
pud_t _pud;
- /*
- * We should set the dirty bit only for FOLL_WRITE but for now
- * the dirty bit in the pud is meaningless. And if the dirty
- * bit will become meaningful and we'll only set it with
- * FOLL_WRITE, an atomic set_bit will be required on the pud to
- * set the young bit, instead of the current set_pud_at.
- */
- _pud = pud_mkyoung(pud_mkdirty(*pud));
+ _pud = pud_mkyoung(*pud);
+ if (flags & FOLL_WRITE)
+ _pud = pud_mkdirty(_pud);
if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
- pud, _pud, 1))
+ pud, _pud, flags & FOLL_WRITE))
update_mmu_cache_pud(vma, addr, pud);
}
@@ -1031,7 +1021,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pud);
+ touch_pud(vma, addr, pud, flags);
/*
* device mapped pages can only be returned if the
@@ -1424,7 +1414,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd);
+ touch_pmd(vma, addr, pmd, flags);
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1920,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmdp_invalidate() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
- entry = *pmd;
- pmdp_invalidate(vma, addr, pmd);
-
- /*
- * Recover dirty/young flags. It relies on pmdp_invalidate to not
- * corrupt them.
- */
- if (pmd_dirty(*pmd))
- entry = pmd_mkdirty(entry);
- if (pmd_young(*pmd))
- entry = pmd_mkyoung(entry);
+ entry = pmdp_invalidate(vma, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
@@ -2083,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct mm_struct *mm = vma->vm_mm;
struct page *page;
pgtable_t pgtable;
- pmd_t _pmd;
- bool young, write, dirty, soft_dirty, pmd_migration = false;
+ pmd_t old_pmd, _pmd;
+ bool young, write, soft_dirty, pmd_migration = false;
unsigned long addr;
int i;
@@ -2126,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
+ /*
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPU doesn't like that.
+ * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+ * 383 on page 93. Intel should be safe but is also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLB is identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * must remain set at all times on the pmd until the split is complete
+ * for this pmd), then we flush the SMP TLB and finally we write the
+ * non-huge version of the pmd entry with pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- pmd_migration = is_pmd_migration_entry(*pmd);
+ pmd_migration = is_pmd_migration_entry(old_pmd);
if (pmd_migration) {
swp_entry_t entry;
- entry = pmd_to_swp_entry(*pmd);
+ entry = pmd_to_swp_entry(old_pmd);
page = pfn_to_page(swp_offset(entry));
} else
#endif
- page = pmd_page(*pmd);
+ page = pmd_page(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
- write = pmd_write(*pmd);
- young = pmd_young(*pmd);
- dirty = pmd_dirty(*pmd);
- soft_dirty = pmd_soft_dirty(*pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+ write = pmd_write(old_pmd);
+ young = pmd_young(old_pmd);
+ soft_dirty = pmd_soft_dirty(old_pmd);
- pmdp_huge_split_prepare(vma, haddr, pmd);
+ /*
+ * Withdraw the table only after we mark the pmd entry invalid.
+ * This's critical for some architectures (Power).
+ */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2170,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
}
- if (dirty)
- SetPageDirty(page + i);
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
@@ -2199,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * and pmd_trans_splitting must remain set at all times on the pmd
- * until the split is complete for this pmd), then we flush the SMP TLB
- * and finally we write the non-huge version of the pmd entry with
- * pmd_populate.
- */
- pmdp_invalidate(vma, haddr, pmd);
pmd_populate(mm, pmd, pgtable);
if (freeze) {
OpenPOWER on IntegriCloud