diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-13 13:11:15 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-13 13:11:15 -0800 |
commit | f6e858a00af788bab0fd4c0b7f5cd788000edc18 (patch) | |
tree | f9403ca3671be9821dbf83e726e61dbe75fbca6b /mm/huge_memory.c | |
parent | 193c0d682525987db59ac3a24531a77e4947aa95 (diff) | |
parent | 98870901cce098bbe94d90d2c41d8d1fa8d94392 (diff) | |
download | blackbird-op-linux-f6e858a00af788bab0fd4c0b7f5cd788000edc18.tar.gz blackbird-op-linux-f6e858a00af788bab0fd4c0b7f5cd788000edc18.zip |
Merge branch 'akpm' (Andrew's patch-bomb)
Merge misc VM changes from Andrew Morton:
"The rest of most-of-MM. The other MM bits await a slab merge.
This patch includes the addition of a huge zero_page. Not a
performance boost but it an save large amounts of physical memory in
some situations.
Also a bunch of Fujitsu engineers are working on memory hotplug.
Which, as it turns out, was badly broken. About half of their patches
are included here; the remainder are 3.8 material."
However, this merge disables CONFIG_MOVABLE_NODE, which was totally
broken. We don't add new features with "default y", nor do we add
Kconfig questions that are incomprehensible to most people without any
help text. Does the feature even make sense without compaction or
memory hotplug?
* akpm: (54 commits)
mm/bootmem.c: remove unused wrapper function reserve_bootmem_generic()
mm/memory.c: remove unused code from do_wp_page()
asm-generic, mm: pgtable: consolidate zero page helpers
mm/hugetlb.c: fix warning on freeing hwpoisoned hugepage
hwpoison, hugetlbfs: fix RSS-counter warning
hwpoison, hugetlbfs: fix "bad pmd" warning in unmapping hwpoisoned hugepage
mm: protect against concurrent vma expansion
memcg: do not check for mm in __mem_cgroup_count_vm_event
tmpfs: support SEEK_DATA and SEEK_HOLE (reprise)
mm: provide more accurate estimation of pages occupied by memmap
fs/buffer.c: remove redundant initialization in alloc_page_buffers()
fs/buffer.c: do not inline exported function
writeback: fix a typo in comment
mm: introduce new field "managed_pages" to struct zone
mm, oom: remove statically defined arch functions of same name
mm, oom: remove redundant sleep in pagefault oom handler
mm, oom: cleanup pagefault oom handler
memory_hotplug: allow online/offline memory to result movable node
numa: add CONFIG_MOVABLE_NODE for movable-dedicated node
mm, memcg: avoid unnecessary function call when memcg is disabled
...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 359 |
1 files changed, 333 insertions, 26 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f902e20e8c0..827d9c813051 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,12 +12,14 @@ #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> +#include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/pagemap.h> + #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" @@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; @@ -159,6 +162,77 @@ static int start_khugepaged(void) return err; } +static atomic_t huge_zero_refcount; +static unsigned long huge_zero_pfn __read_mostly; + +static inline bool is_huge_zero_pfn(unsigned long pfn) +{ + unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); + return zero_pfn && pfn == zero_pfn; +} + +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_pfn(pmd_pfn(pmd)); +} + +static unsigned long get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return ACCESS_ONCE(huge_zero_pfn); + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + HPAGE_PMD_ORDER); + if (!zero_page) { + count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); + return 0; + } + count_vm_event(THP_ZERO_PAGE_ALLOC); + preempt_disable(); + if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { + preempt_enable(); + __free_page(zero_page); + goto retry; + } + + /* We take additional reference here. It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + return ACCESS_ONCE(huge_zero_pfn); +} + +static void put_huge_zero_page(void) +{ + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. + */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); +} + +static int shrink_huge_zero_page(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (!sc->nr_to_scan) + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; + + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); + BUG_ON(zero_pfn == 0); + __free_page(__pfn_to_page(zero_pfn)); + } + + return 0; +} + +static struct shrinker huge_zero_page_shrinker = { + .shrink = shrink_huge_zero_page, + .seeks = DEFAULT_SEEKS, +}; + #ifdef CONFIG_SYSFS static ssize_t double_flag_show(struct kobject *kobj, @@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj, static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); +static ssize_t use_zero_page_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static ssize_t use_zero_page_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static struct kobj_attribute use_zero_page_attr = + __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr = static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, + &use_zero_page_attr.attr, #ifdef CONFIG_DEBUG_VM &debug_cow_attr.attr, #endif @@ -550,6 +639,8 @@ static int __init hugepage_init(void) goto out; } + register_shrinker(&huge_zero_page_shrinker); + /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead @@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag) } #endif +static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + unsigned long zero_pfn) +{ + pmd_t entry; + if (!pmd_none(*pmd)) + return false; + entry = pfn_pmd(zero_pfn, vma->vm_page_prot); + entry = pmd_wrprotect(entry); + entry = pmd_mkhuge(entry); + set_pmd_at(mm, haddr, pmd, entry); + pgtable_trans_huge_deposit(mm, pgtable); + mm->nr_ptes++; + return true; +} + int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) @@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE) && + transparent_hugepage_use_zero_page()) { + pgtable_t pgtable; + unsigned long zero_pfn; + bool set; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) + return VM_FAULT_OOM; + zero_pfn = get_huge_zero_page(); + if (unlikely(!zero_pfn)) { + pte_free(mm, pgtable); + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } + spin_lock(&mm->page_table_lock); + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_pfn); + spin_unlock(&mm->page_table_lock); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); + } + return 0; + } page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), vma, haddr, numa_node_id(), 0); if (unlikely(!page)) { @@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_free(dst_mm, pgtable); goto out_unlock; } + /* + * mm->page_table_lock is enough to be sure that huge zero pmd is not + * under splitting since we don't split the page itself, only pmd to + * a page table. + */ + if (is_huge_zero_pmd(pmd)) { + unsigned long zero_pfn; + bool set; + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. + */ + zero_pfn = get_huge_zero_page(); + set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + zero_pfn); + BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ + ret = 0; + goto out_unlock; + } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ spin_unlock(&src_mm->page_table_lock); @@ -806,6 +957,80 @@ unlock: spin_unlock(&mm->page_table_lock); } +static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + struct page *page; + int i, ret = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!page) { + ret |= VM_FAULT_OOM; + goto out; + } + + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + + clear_user_highpage(page, address); + __SetPageUptodate(page); + + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_page; + + pmdp_clear_flush(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + if (haddr == (address & PAGE_MASK)) { + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, haddr); + } else { + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + } + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + spin_unlock(&mm->page_table_lock); + put_huge_zero_page(); + inc_mm_counter(mm, MM_ANONPAGES); + + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + ret |= VM_FAULT_WRITE; +out: + return ret; +out_free_page: + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mem_cgroup_uncharge_page(page); + put_page(page); + goto out; +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { int ret = 0; - struct page *page, *new_page; + struct page *page = NULL, *new_page; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); + haddr = address & HPAGE_PMD_MASK; + if (is_huge_zero_pmd(orig_pmd)) + goto alloc; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); VM_BUG_ON(!PageCompound(page) || !PageHead(page)); - haddr = address & HPAGE_PMD_MASK; if (page_mapcount(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); @@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } get_page(page); spin_unlock(&mm->page_table_lock); - +alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), @@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!new_page)) { count_vm_event(THP_FAULT_FALLBACK); - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); - if (ret & VM_FAULT_OOM) - split_huge_page(page); - put_page(page); + if (is_huge_zero_pmd(orig_pmd)) { + ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, + address, pmd, orig_pmd, haddr); + } else { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) + split_huge_page(page); + put_page(page); + } goto out; } count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); - split_huge_page(page); - put_page(page); + if (page) { + split_huge_page(page); + put_page(page); + } ret |= VM_FAULT_OOM; goto out; } - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + if (is_huge_zero_pmd(orig_pmd)) + clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + else + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); mmun_start = haddr; @@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); - put_page(page); + if (page) + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); @@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mn; } else { pmd_t entry; - VM_BUG_ON(!PageHead(page)); entry = mk_huge_pmd(new_page, vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); - page_remove_rmap(page); - put_page(page); + if (is_huge_zero_pmd(orig_pmd)) { + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + put_huge_zero_page(); + } else { + VM_BUG_ON(!PageHead(page)); + page_remove_rmap(page); + put_page(page); + } ret |= VM_FAULT_WRITE; } spin_unlock(&mm->page_table_lock); @@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; pgtable = pgtable_trans_huge_withdraw(tlb->mm); orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); - page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - page_remove_rmap(page); - VM_BUG_ON(page_mapcount(page) < 0); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); - tlb_remove_page(tlb, page); + if (is_huge_zero_pmd(orig_pmd)) { + tlb->mm->nr_ptes--; + spin_unlock(&tlb->mm->page_table_lock); + put_huge_zero_page(); + } else { + page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + } pte_free(tlb->mm, pgtable); ret = 1; } @@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_modify(entry, newprot); + BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; @@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page) struct anon_vma *anon_vma; int ret = 1; + BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); anon_vma = page_lock_anon_vma(page); if (!anon_vma) @@ -2336,19 +2587,65 @@ static int khugepaged(void *none) return 0; } -void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; + pmd_t _pmd; + int i; + + pmdp_clear_flush(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + put_huge_zero_page(); +} + +void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd) { struct page *page; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + return; + } + if (is_huge_zero_pmd(*pmd)) { + __split_huge_zero_page_pmd(vma, haddr, pmd); + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); @@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) BUG_ON(pmd_trans_huge(*pmd)); } +void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, + pmd_t *pmd) +{ + struct vm_area_struct *vma; + + vma = find_vma(mm, address); + BUG_ON(vma == NULL); + split_huge_page_pmd(vma, address, pmd); +} + static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { @@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm, * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, address, pmd); } void __vma_adjust_trans_huge(struct vm_area_struct *vma, |