diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 122 |
1 files changed, 64 insertions, 58 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fd3a07b3e6f4..1ea21e203a70 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; -static DEFINE_SPINLOCK(split_queue_lock); -static LIST_HEAD(split_queue); -static unsigned long split_queue_len; static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) @@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - pgtable_trans_huge_deposit(mm, pmd, pgtable); + if (pgtable) + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); return true; @@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; - pgtable_t pgtable; + pgtable_t pgtable = NULL; int ret; - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; + if (!vma_is_dax(vma)) { + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + } dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (pmd_trans_huge(pmd)) { + if (!vma_is_dax(vma)) { /* thp accounting separate from pmd_devmap accounting */ src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); @@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl) && + vma_is_anonymous(vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); @@ -2835,6 +2836,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pgtable_t pgtable; pmd_t _pmd; bool young, write, dirty; + unsigned long addr; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); @@ -2860,10 +2862,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); + pmdp_huge_split_prepare(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry, *pte; /* * Note that NUMA hinting access restrictions are not @@ -2884,9 +2887,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } if (dirty) SetPageDirty(page + i); - pte = pte_offset_map(&_pmd, haddr); + pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); + set_pte_at(mm, addr, pte, entry); atomic_inc(&page[i]._mapcount); pte_unmap(pte); } @@ -2936,7 +2939,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pmd_populate(mm, pmd, pgtable); if (freeze) { - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + for (i = 0; i < HPAGE_PMD_NR; i++) { page_remove_rmap(page + i, false); put_page(page + i); } @@ -3217,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) } } -static int __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { - int mapcount; struct page *page_tail = head + tail; - mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_add(), we + * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_add(). + * it's safer to use atomic_inc(). */ - atomic_add(mapcount + 1, &page_tail->_count); - + atomic_inc(&page_tail->_count); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3272,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); lru_add_page_tail(head, page_tail, lruvec, list); - - return mapcount; } static void __split_huge_page(struct page *page, struct list_head *list) @@ -3281,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - int i, tail_mapcount; + int i; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -3290,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - tail_mapcount = 0; for (i = HPAGE_PMD_NR - 1; i >= 1; i--) - tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); - atomic_sub(tail_mapcount, &head->_count); + __split_huge_page_tail(head, i, lruvec, list); ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); @@ -3358,6 +3355,7 @@ int total_mapcount(struct page *page) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct anon_vma *anon_vma; int count, mapcount, ret; bool mlocked; @@ -3401,19 +3399,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* Prevent deferred_split_scan() touching ->_count */ - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && count == 1) { if (!list_empty(page_deferred_list(head))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); __split_huge_page(page, list); ret = 0; } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); pr_alert("total_mapcount: %u, page_count(): %u\n", mapcount, count); if (PageTail(page)) @@ -3421,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } else { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); unfreeze_page(anon_vma, head); ret = -EBUSY; } @@ -3436,64 +3434,65 @@ out: void free_transhuge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { - list_add_tail(page_deferred_list(page), &split_queue); - split_queue_len++; + list_add_tail(page_deferred_list(page), &pgdata->split_queue); + pgdata->split_queue_len++; } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { - /* - * Split a page from split_queue will free up at least one page, - * at most HPAGE_PMD_NR - 1. We don't track exact number. - * Let's use HPAGE_PMD_NR / 2 as ballpark. - */ - return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; + struct pglist_data *pgdata = NODE_DATA(sc->nid); + return ACCESS_ONCE(pgdata->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct pglist_data *pgdata = NODE_DATA(sc->nid); unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_init(&split_queue, &list); - + spin_lock_irqsave(&pgdata->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &list) { + list_for_each_safe(pos, next, &pgdata->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); - /* race with put_compound_page() */ - if (!get_page_unless_zero(page)) { + if (get_page_unless_zero(page)) { + list_move(page_deferred_list(page), &list); + } else { + /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); - split_queue_len--; + pgdata->split_queue_len--; } + if (!--sc->nr_to_scan) + break; } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -3505,17 +3504,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, put_page(page); } - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_tail(&list, &split_queue); - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_tail(&list, &pgdata->split_queue); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - return split * HPAGE_PMD_NR / 2; + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. + */ + if (!split && list_empty(&pgdata->split_queue)) + return SHRINK_STOP; + return split; } static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, }; #ifdef CONFIG_DEBUG_FS |