diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 196 |
1 files changed, 99 insertions, 97 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55478ab3c83b..faf357eaf0ce 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; +bool transparent_hugepage_enabled(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return __transparent_hugepage_enabled(vma); + if (vma_is_shmem(vma) && shmem_huge_enabled(vma)) + return __transparent_hugepage_enabled(vma); + + return false; +} + static struct page *get_huge_zero_page(void) { struct page *zero_page; @@ -420,7 +430,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } @@ -558,7 +568,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; @@ -629,40 +639,30 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - gfp_t this_node = 0; - -#ifdef CONFIG_NUMA - struct mempolicy *pol; - /* - * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not - * specified, to express a general desire to stay on the current - * node for optimistic allocation attempts. If the defrag mode - * and/or madvise hint requires the direct reclaim then we prefer - * to fallback to other node rather than node reclaim because that - * can lead to excessive reclaim even though there is free memory - * on other nodes. We expect that NUMA preferences are specified - * by memory policies. - */ - pol = get_vma_policy(vma, addr); - if (pol->mode != MPOL_BIND) - this_node = __GFP_THISNODE; - mpol_cond_put(pol); -#endif + /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM | this_node); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - this_node); - return GFP_TRANSHUGE_LIGHT | this_node; + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + + return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ @@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *zero_page; bool set; vm_fault_t ret; - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); @@ -734,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma, haddr); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + gfp = alloc_hugepage_direct_gfpmask(vma); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm, addr); + pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } @@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!vma_is_anonymous(vma)) return 0; - pgtable = pte_alloc_one(dst_mm, addr); + pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; @@ -1144,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, int i; vm_fault_t ret = 0; struct page **pages; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), GFP_KERNEL); @@ -1183,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) @@ -1230,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); + mmu_notifier_invalidate_range_only_end(&range); ret |= VM_FAULT_WRITE; put_page(page); @@ -1241,7 +1239,7 @@ out: out_free_pages: spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); @@ -1258,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) struct page *page = NULL, *new_page; struct mem_cgroup *memcg; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; gfp_t huge_gfp; /* for allocation and charge */ vm_fault_t ret = 0; @@ -1303,11 +1300,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) get_page(page); spin_unlock(vmf->ptl); alloc: - if (transparent_hugepage_enabled(vma) && + if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); - new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, - haddr, numa_node_id()); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -1349,9 +1345,9 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); if (page) @@ -1386,8 +1382,7 @@ out_mn: * No need to double call mmu_notifier->invalidate_range() callback as * the above pmdp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); + mmu_notifier_invalidate_range_only_end(&range); out: return ret; out_unlock: @@ -1501,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); goto out; } @@ -1538,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); goto out; } @@ -2028,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PUD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); - ptl = pud_lock(mm, pud); + mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pud_lock(vma->vm_mm, pud); if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) goto out; - __split_huge_pud_locked(vma, pud, haddr); + __split_huge_pud_locked(vma, pud, range.start); out: spin_unlock(ptl); @@ -2043,8 +2037,7 @@ out: * No need to double call mmu_notifier->invalidate_range() callback as * the above pudp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2155,23 +2148,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ old_pmd = pmdp_invalidate(vma, haddr, pmd); -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION pmd_migration = is_pmd_migration_entry(old_pmd); - if (pmd_migration) { + if (unlikely(pmd_migration)) { swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); - } else -#endif + write = is_write_migration_entry(entry); + young = false; + soft_dirty = pmd_swp_soft_dirty(old_pmd); + } else { page = pmd_page(old_pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); + } VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - if (pmd_dirty(old_pmd)) - SetPageDirty(page); - write = pmd_write(old_pmd); - young = pmd_young(old_pmd); - soft_dirty = pmd_soft_dirty(old_pmd); /* * Withdraw the table only after we mark the pmd entry invalid. @@ -2244,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); - ptl = pmd_lock(mm, pmd); + mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pmd_lock(vma->vm_mm, pmd); /* * If caller asks to setup a migration entries, we need a page to check @@ -2264,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, freeze); + __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); /* @@ -2280,8 +2276,7 @@ out: * any further changes to individual pte will notify. So no need * to call mmu_notifier->invalidate_range() */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, @@ -2350,7 +2345,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, } } -static void freeze_page(struct page *page) +static void unmap_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; @@ -2365,7 +2360,7 @@ static void freeze_page(struct page *page) VM_BUG_ON_PAGE(!unmap_success, page); } -static void unfreeze_page(struct page *page) +static void remap_page(struct page *page) { int i; if (PageTransHuge(page)) { @@ -2402,6 +2397,12 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_unevictable) | (1L << PG_dirty))); + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + page_tail->index = head->index + tail; + /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); @@ -2422,12 +2423,6 @@ static void __split_huge_page_tail(struct page *head, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); - page_tail->mapping = head->mapping; - - page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); /* @@ -2439,12 +2434,11 @@ static void __split_huge_page_tail(struct page *head, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - unsigned long flags) + pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - pgoff_t end = -1; int i; lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); @@ -2452,9 +2446,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - if (!PageAnon(page)) - end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2483,7 +2474,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + remap_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -2626,6 +2617,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int count, mapcount, extra_pins, ret; bool mlocked; unsigned long flags; + pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -2648,6 +2640,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } + end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2661,10 +2654,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) anon_vma = NULL; i_mmap_lock_read(mapping); + + /* + *__split_huge_page() may need to trim off pages beyond EOF: + * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, + * which cannot be nested inside the page tree lock. So note + * end now: i_size itself may be changed at any moment, but + * head page lock is good enough to serialize the trimming. + */ + end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); } /* - * Racy check if we can split the page, before freeze_page() will + * Racy check if we can split the page, before unmap_page() will * split PMDs */ if (!can_split_huge_page(head, &extra_pins)) { @@ -2673,7 +2675,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(head); + unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -2707,7 +2709,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); - __split_huge_page(page, list, flags); + __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2727,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + remap_page(head); ret = -EBUSY; } |