From 0abdd7a81b7e3fd781d7fabcca49501852bba17e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 21 Jan 2014 15:48:12 -0800 Subject: dma-debug: introduce debug_dma_assert_idle() Record actively mapped pages and provide an api for asserting a given page is dma inactive before execution proceeds. Placing debug_dma_assert_idle() in cow_user_page() flagged the violation of the dma-api in the NET_DMA implementation (see commit 77873803363c "net_dma: mark broken"). The implementation includes the capability to count, in a limited way, repeat mappings of the same page that occur without an intervening unmap. This 'overlap' counter is limited to the few bits of tag space in a radix tree. This mechanism is added to mitigate false negative cases where, for example, a page is dma mapped twice and debug_dma_assert_idle() is called after the page is un-mapped once. Signed-off-by: Dan Williams Cc: Joerg Roedel Cc: Vinod Koul Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..e9c550484ba6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { + debug_dma_assert_idle(src); + /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by -- cgit v1.2.1 From a0368d4e48fc9ad65a66f6819a801f3f542b4f0f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:49 -0800 Subject: mm: hugetlb: use get_page_foll() in follow_hugetlb_page() get_page_foll() is more optimal and is always safe to use under the PT lock. More so for hugetlbfs as there's no risk of race conditions with split_huge_page regardless of the PT lock. Signed-off-by: Andrea Arcangeli Tested-by: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dee6cf4e6d34..7596e104bffa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3079,7 +3079,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page(pages[i]); + get_page_foll(pages[i]); } if (vmas) -- cgit v1.2.1 From ebf360f9bb957f68e19e88f5067c015997dc26a6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:51 -0800 Subject: mm: hugetlbfs: move the put/get_page slab and hugetlbfs optimization in a faster path We don't actually need a reference on the head page in the slab and hugetlbfs paths, as long as we add a smp_rmb() which should be faster than get_page_unless_zero. [akpm@linux-foundation.org: fix typo in comment] Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 140 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 78 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 84b26aaabd03..e2757fbb04ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -86,45 +86,61 @@ static void put_compound_page(struct page *page) /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); + /* + * THP can not break up slab pages so avoid taking + * compound_lock(). Slab performs non-atomic bit ops + * on page->flags for better performance. In + * particular slab_unlock() in slub used to be a hot + * path. It is still hot on arches that do not support + * this_cpu_cmpxchg_double(). + * + * If "page" is part of a slab or hugetlbfs page it + * cannot be splitted and the head page cannot change + * from under us. And if "page" is part of a THP page + * under splitting, if the head page pointed by the + * THP tail isn't a THP head anymore, we'll find + * PageTail clear after smp_rmb() and we'll treat it + * as a single page. + */ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + /* + * If "page" is a THP tail, we must read the tail page + * flags after the head page flags. The + * split_huge_page side enforces write memory + * barriers between clearing PageTail and before the + * head page can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + return; + } else + /* + * __split_huge_page_refcount + * run before us, "page" was a + * THP tail. The split + * page_head has been freed + * and reallocated as slab or + * hugetlbfs page of smaller + * order (only possible if + * reallocated as slab on + * x86). + */ + goto out_put_single; + } + if (likely(page != page_head && get_page_unless_zero(page_head))) { unsigned long flags; - /* - * THP can not break up slab pages so avoid taking - * compound_lock(). Slab performs non-atomic bit ops - * on page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is - * still hot on arches that do not support - * this_cpu_cmpxchg_double(). - */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - atomic_dec(&page->_mapcount); - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - if (put_page_testzero(page_head)) - __put_compound_page(page_head); - return; - } else - /* - * __split_huge_page_refcount - * run before us, "page" was a - * THP tail. The split - * page_head has been freed - * and reallocated as slab or - * hugetlbfs page of smaller - * order (only possible if - * reallocated as slab on - * x86). - */ - goto skip_lock; - } /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time @@ -135,7 +151,6 @@ static void put_compound_page(struct page *page) if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); -skip_lock: if (put_page_testzero(page_head)) { /* * The head page may have been @@ -221,36 +236,37 @@ bool __get_page_tail(struct page *page) * split_huge_page(). */ unsigned long flags; - bool got = false; + bool got; struct page *page_head = compound_trans_head(page); - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - __get_page_tail_foll(page, false); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - put_page(page_head); - return false; - } + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + __get_page_tail_foll(page, true); + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ + return false; } + } + got = false; + if (likely(page != page_head && get_page_unless_zero(page_head))) { /* * page_head wasn't a dangling pointer but it * may not be a head page anymore by the time -- cgit v1.2.1 From 44518d2b32646e37b4b7a0813bbbe98dc21c7f8f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:54 -0800 Subject: mm: tail page refcounting optimization for slab and hugetlbfs This skips the _mapcount mangling for slab and hugetlbfs pages. The main trouble in doing this is to guarantee that PageSlab and PageHeadHuge remains constant for all get_page/put_page run on the tail of slab or hugetlbfs compound pages. Otherwise if they're set during get_page but not set during put_page, the _mapcount of the tail page would underflow. PageHeadHuge will remain true until the compound page is released and enters the buddy allocator so it won't risk to change even if the tail page is the last reference left on the page. PG_slab instead is cleared before the slab frees the head page with put_page, so if the tail pin is released after the slab freed the page, we would have a problem. But in the slab case the tail pin cannot be the last reference left on the page. This is because the slab code is free to reuse the compound page after a kfree/kmem_cache_free without having to check if there's any tail pin left. In turn all tail pins must be always released while the head is still pinned by the slab code and so we know PG_slab will be still set too. Signed-off-by: Andrea Arcangeli Reviewed-by: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/swap.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 684f7aa9692a..a85a3ab1f7ef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,7 +51,8 @@ static inline void __get_page_tail_foll(struct page *page, VM_BUG_ON(page_mapcount(page) < 0); if (get_page_head) atomic_inc(&page->first_page->_count); - atomic_inc(&page->_mapcount); + if (compound_tail_refcounted(page->first_page)) + atomic_inc(&page->_mapcount); } /* diff --git a/mm/swap.c b/mm/swap.c index e2757fbb04ea..bba4aa5bf686 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -88,8 +88,9 @@ static void put_compound_page(struct page *page) /* * THP can not break up slab pages so avoid taking - * compound_lock(). Slab performs non-atomic bit ops - * on page->flags for better performance. In + * compound_lock() and skip the tail page refcounting + * (in _mapcount) too. Slab performs non-atomic bit + * ops on page->flags for better performance. In * particular slab_unlock() in slub used to be a hot * path. It is still hot on arches that do not support * this_cpu_cmpxchg_double(). @@ -102,7 +103,7 @@ static void put_compound_page(struct page *page) * PageTail clear after smp_rmb() and we'll treat it * as a single page. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (!__compound_tail_refcounted(page_head)) { /* * If "page" is a THP tail, we must read the tail page * flags after the head page flags. The @@ -117,10 +118,30 @@ static void put_compound_page(struct page *page) * cannot race here. */ VM_BUG_ON(!PageHead(page_head)); - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - if (put_page_testzero(page_head)) + VM_BUG_ON(page_mapcount(page) != 0); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a + * slab compound page, the + * tail pin must not be the + * last reference held on the + * page, because the PG_slab + * cannot be cleared before + * all tail pins (which skips + * the _mapcount tail + * refcounting) have been + * released. For hugetlbfs the + * tail pin may be the last + * reference on the page + * instead, because + * PageHeadHuge will not go + * away until the compound + * page enters the buddy + * allocator. + */ + VM_BUG_ON(PageSlab(page_head)); __put_compound_page(page_head); + } return; } else /* -- cgit v1.2.1 From 3bfcd13ec0b43b39b02072ba67bf197d15379387 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:48:56 -0800 Subject: mm: hugetlbfs: use __compound_tail_refcounted in __get_page_tail too Also remove hugetlb.h which isn't needed anymore as PageHeadHuge is handled in mm.h. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index bba4aa5bf686..7434e3619c14 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "internal.h" @@ -261,7 +260,7 @@ bool __get_page_tail(struct page *page) struct page *page_head = compound_trans_head(page); /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (!__compound_tail_refcounted(page_head)) { smp_rmb(); if (likely(PageTail(page))) { /* -- cgit v1.2.1 From 758f66a29ccc6383353fd395aa04be15e8dea445 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 15:48:57 -0800 Subject: mm/hugetlb.c: simplify PageHeadHuge() and PageHuge() Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7596e104bffa..1d9125360bf5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ int PageHuge(struct page *page) { - compound_page_dtor *dtor; - if (!PageCompound(page)) return 0; page = compound_head(page); - dtor = get_compound_page_dtor(page); - - return dtor == free_huge_page; + return get_compound_page_dtor(page) == free_huge_page; } EXPORT_SYMBOL_GPL(PageHuge); @@ -708,14 +704,10 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - compound_page_dtor *dtor; - if (!PageHead(page_head)) return 0; - dtor = get_compound_page_dtor(page_head); - - return dtor == free_huge_page; + return get_compound_page_dtor(page_head) == free_huge_page; } EXPORT_SYMBOL_GPL(PageHeadHuge); -- cgit v1.2.1 From 26296ad2dfb4059f840e46cd7af38d0025a9d8d7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 15:48:59 -0800 Subject: mm/swap.c: reorganize put_compound_page() Tweak it so save a tab stop, make code layout slightly less nutty. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 254 +++++++++++++++++++++++++++++++------------------------------- 1 file changed, 125 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 7434e3619c14..d1100b619e61 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -81,154 +81,150 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - if (unlikely(PageTail(page))) { - /* __split_huge_page_refcount can run under us */ - struct page *page_head = compound_trans_head(page); + struct page *page_head; - /* - * THP can not break up slab pages so avoid taking - * compound_lock() and skip the tail page refcounting - * (in _mapcount) too. Slab performs non-atomic bit - * ops on page->flags for better performance. In - * particular slab_unlock() in slub used to be a hot - * path. It is still hot on arches that do not support - * this_cpu_cmpxchg_double(). - * - * If "page" is part of a slab or hugetlbfs page it - * cannot be splitted and the head page cannot change - * from under us. And if "page" is part of a THP page - * under splitting, if the head page pointed by the - * THP tail isn't a THP head anymore, we'll find - * PageTail clear after smp_rmb() and we'll treat it - * as a single page. - */ - if (!__compound_tail_refcounted(page_head)) { + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { /* - * If "page" is a THP tail, we must read the tail page - * flags after the head page flags. The - * split_huge_page side enforces write memory - * barriers between clearing PageTail and before the - * head page can be freed and reallocated. + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - VM_BUG_ON(page_mapcount(page) != 0); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a - * slab compound page, the - * tail pin must not be the - * last reference held on the - * page, because the PG_slab - * cannot be cleared before - * all tail pins (which skips - * the _mapcount tail - * refcounting) have been - * released. For hugetlbfs the - * tail pin may be the last - * reference on the page - * instead, because - * PageHeadHuge will not go - * away until the compound - * page enters the buddy - * allocator. - */ - VM_BUG_ON(PageSlab(page_head)); - __put_compound_page(page_head); - } - return; - } else - /* - * __split_huge_page_refcount - * run before us, "page" was a - * THP tail. The split - * page_head has been freed - * and reallocated as slab or - * hugetlbfs page of smaller - * order (only possible if - * reallocated as slab on - * x86). - */ - goto out_put_single; + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } + return; + } - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - unsigned long flags; + /* __split_huge_page_refcount can run under us */ + page_head = compound_trans_head(page); + /* + * THP can not break up slab pages so avoid taking + * compound_lock() and skip the tail page refcounting (in + * _mapcount) too. Slab performs non-atomic bit ops on + * page->flags for better performance. In particular + * slab_unlock() in slub used to be a hot path. It is still + * hot on arches that do not support + * this_cpu_cmpxchg_double(). + * + * If "page" is part of a slab or hugetlbfs page it cannot be + * splitted and the head page cannot change from under us. And + * if "page" is part of a THP page under splitting, if the + * head page pointed by the THP tail isn't a THP head anymore, + * we'll find PageTail clear after smp_rmb() and we'll treat + * it as a single page. + */ + if (!__compound_tail_refcounted(page_head)) { + /* + * If "page" is a THP tail, we must read the tail page + * flags after the head page flags. The + * split_huge_page side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. + * __split_huge_page_refcount cannot race + * here. */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The head page may have been - * freed and reallocated as a - * compound page of smaller - * order and then freed again. - * All we know is that it - * cannot have become: a THP - * page, a compound page of - * higher order, a tail page. - * That is because we still - * hold the refcount of the - * split THP tail and - * page_head was the THP head - * before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; + VM_BUG_ON(!PageHead(page_head)); + VM_BUG_ON(page_mapcount(page) != 0); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab + * compound page, the tail pin must + * not be the last reference held on + * the page, because the PG_slab + * cannot be cleared before all tail + * pins (which skips the _mapcount + * tail refcounting) have been + * released. For hugetlbfs the tail + * pin may be the last reference on + * the page instead, because + * PageHeadHuge will not go away until + * the compound page enters the buddy + * allocator. + */ + VM_BUG_ON(PageSlab(page_head)); + __put_compound_page(page_head); } - VM_BUG_ON(page_head != page->first_page); + return; + } else /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on - * the compound_lock. + * __split_huge_page_refcount run before us, + * "page" was a THP tail. The split page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). */ - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - VM_BUG_ON(atomic_read(&page_head->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - compound_unlock_irqrestore(page_head, flags); + goto out_put_single; + } + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { + /* + * The head page may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split. + */ if (PageHead(page_head)) __put_compound_page(page_head); else __put_single_page(page_head); } - } else { - /* page_head is a dangling pointer */ - VM_BUG_ON(PageTail(page)); - goto out_put_single; +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; } - } else if (put_page_testzero(page)) { - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; } } -- cgit v1.2.1 From 9b7ac260188ddacffdcaadd6a61e4a502238a63f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 21 Jan 2014 15:49:01 -0800 Subject: mm/hugetlb.c: defer PageHeadHuge() symbol export No actual need of it. So keep it internal. Signed-off-by: Andrea Arcangeli Cc: Khalid Aziz Cc: Pravin Shelar Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: Christoph Lameter Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d9125360bf5..f730b7a37590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -709,7 +709,6 @@ int PageHeadHuge(struct page *page_head) return get_compound_page_dtor(page_head) == free_huge_page; } -EXPORT_SYMBOL_GPL(PageHeadHuge); pgoff_t __basepage_index(struct page *page) { -- cgit v1.2.1 From c728852f5dd41ce34e7ce0a179cf28cd5f4dc301 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:02 -0800 Subject: mm: thp: __get_page_tail_foll() can use get_huge_page_tail() Cleanup. Change __get_page_tail_foll() to use get_huge_page_tail() to avoid the code duplication. Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Dave Jones Cc: Darren Hart Cc: Peter Zijlstra Cc: Mel Gorman Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index a85a3ab1f7ef..a346ba120e42 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -47,12 +47,9 @@ static inline void __get_page_tail_foll(struct page *page, * page_cache_get_speculative()) on tail pages. */ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - VM_BUG_ON(page_mapcount(page) < 0); if (get_page_head) atomic_inc(&page->first_page->_count); - if (compound_tail_refcounted(page->first_page)) - atomic_inc(&page->_mapcount); + get_huge_page_tail(page); } /* -- cgit v1.2.1 From 943dca1a1fcbccb58de944669b833fd38a6c809b Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 21 Jan 2014 15:49:06 -0800 Subject: mm: get rid of unnecessary pageblock scanning in setup_zone_migrate_reserve Yasuaki Ishimatsu reported memory hot-add spent more than 5 _hours_ on 9TB memory machine since onlining memory sections is too slow. And we found out setup_zone_migrate_reserve spent >90% of the time. The problem is, setup_zone_migrate_reserve scans all pageblocks unconditionally, but it is only necessary if the number of reserved block was reduced (i.e. memory hot remove). Moreover, maximum MIGRATE_RESERVE per zone is currently 2. It means that the number of reserved pageblocks is almost always unchanged. This patch adds zone->nr_migrate_reserve_block to maintain the number of MIGRATE_RESERVE pageblocks and it reduces the overhead of setup_zone_migrate_reserve dramatically. The following table shows time of onlining a memory section. Amount of memory | 128GB | 192GB | 256GB| --------------------------------------------- linux-3.12 | 23.9 | 31.4 | 44.5 | This patch | 8.3 | 8.3 | 8.6 | Mel's proposal patch | 10.9 | 19.2 | 31.3 | --------------------------------------------- (millisecond) 128GB : 4 nodes and each node has 32GB of memory 192GB : 6 nodes and each node has 32GB of memory 256GB : 8 nodes and each node has 32GB of memory (*1) Mel proposed his idea by the following threads. https://lkml.org/lkml/2013/10/30/272 [akpm@linux-foundation.org: tweak comment] Signed-off-by: KOSAKI Motohiro Signed-off-by: Yasuaki Ishimatsu Reported-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..89d81f4429ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3901,6 +3901,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) struct page *page; unsigned long block_migratetype; int reserve; + int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve @@ -3922,6 +3923,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) * future allocation of hugepages at runtime. */ reserve = min(2, reserve); + old_reserve = zone->nr_migrate_reserve_block; + + /* When memory hot-add, we almost always need to do nothing */ + if (reserve == old_reserve) + return; + zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) @@ -3959,6 +3966,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve--; continue; } + } else if (!old_reserve) { + /* + * At boot time we don't need to scan the whole zone + * for turning off MIGRATE_RESERVE. + */ + break; } /* -- cgit v1.2.1 From b35f1819acd9243a3ff7ad25b1fa8bd6bfe80fb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 21 Jan 2014 15:49:07 -0800 Subject: mm: create a separate slab for page->ptl allocation If DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC are enabled spinlock_t on x86_64 is 72 bytes. For page->ptl they will be allocated from kmalloc-96 slab, so we loose 24 on each. An average system can easily allocate few tens thousands of page->ptl and overhead is significant. Let's create a separate slab for page->ptl allocation to solve this. To make sure that it really works this time, some numbers from my test machine (just booted, no load): Before: # grep '^\(kmalloc-96\|page->ptl\)' /proc/slabinfo kmalloc-96 31987 32190 128 30 1 : tunables 120 60 8 : slabdata 1073 1073 92 After: # grep '^\(kmalloc-96\|page->ptl\)' /proc/slabinfo page->ptl 27516 28143 72 53 1 : tunables 120 60 8 : slabdata 531 531 9 kmalloc-96 3853 5280 128 30 1 : tunables 120 60 8 : slabdata 176 176 0 Note that the patch is useful not only for debug case, but also for PREEMPT_RT, where spinlock_t is always bloated. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e9c550484ba6..86487dfa5e59 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4275,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + bool ptlock_alloc(struct page *page) { spinlock_t *ptl; - ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; page->ptl = ptl; @@ -4288,6 +4297,6 @@ bool ptlock_alloc(struct page *page) void ptlock_free(struct page *page) { - kfree(page->ptl); + kmem_cache_free(page_ptl_cachep, page->ptl); } #endif -- cgit v1.2.1 From 549543dff797ae1081f61a69f8511c61806c3735 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 21 Jan 2014 15:49:08 -0800 Subject: mm, memory-failure: fix typo in me_pagecache_dirty() [akpm@linux-foundation.org: s/cache/pagecache/] Signed-off-by: Zhi Yong Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..9fa6586d5275 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) } /* - * Dirty cache page page + * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. */ -- cgit v1.2.1 From e8569dd299dbc7bac878325c0bdc7aa449eae479 Mon Sep 17 00:00:00 2001 From: Andreas Sandberg Date: Tue, 21 Jan 2014 15:49:09 -0800 Subject: mm/hugetlb.c: call MMU notifiers when copying a hugetlb page range When copy_hugetlb_page_range() is called to copy a range of hugetlb mappings, the secondary MMUs are not notified if there is a protection downgrade, which breaks COW semantics in KVM. This patch adds the necessary MMU notifier calls. Signed-off-by: Andreas Sandberg Acked-by: Steve Capper Acked-by: Marc Zyngier Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f730b7a37590..1697ff0cc53a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2346,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + int ret = 0; cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + mmun_start = vma->vm_start; + mmun_end = vma->vm_end; + if (cow) + mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); - if (!dst_pte) - goto nomem; + if (!dst_pte) { + ret = -ENOMEM; + break; + } /* If the pagetables are shared don't copy or take references */ if (dst_pte == src_pte) @@ -2377,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_unlock(src_ptl); spin_unlock(dst_ptl); } - return 0; -nomem: - return -ENOMEM; + if (cow) + mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + + return ret; } static int is_hugetlb_entry_migration(pte_t pte) -- cgit v1.2.1 From ece86e222db48d04bda218a2be70e384518bb08c Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Tue, 21 Jan 2014 15:49:12 -0800 Subject: mm/vmalloc: interchage the implementation of vmalloc_to_{pfn,page} Currently we are implementing vmalloc_to_pfn() as a wrapper around vmalloc_to_page(), which is implemented as follow: 1. walks the page talbes to generates the corresponding pfn, 2. then converts the pfn to struct page, 3. returns it. And vmalloc_to_pfn() re-wraps vmalloc_to_page() to get the pfn. This seems too circuitous, so this patch reverses the way: implement vmalloc_to_page() as a wrapper around vmalloc_to_pfn(). This makes vmalloc_to_pfn() and vmalloc_to_page() slightly more efficient. No functional change. Signed-off-by: Jianyu Zhan Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..e4f0db2a3eae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the physical pfn it maps to. */ -struct page *vmalloc_to_page(const void *vmalloc_addr) +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; + unsigned long pfn = 0; pgd_t *pgd = pgd_offset_k(addr); /* @@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) - page = pte_page(pte); + pfn = pte_pfn(pte); pte_unmap(ptep); } } } - return page; + return pfn; } -EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vmalloc_to_pfn); /* - * Map a vmalloc()-space virtual address to the physical page frame number. + * Map a vmalloc()-space virtual address to the struct page. */ -unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +struct page *vmalloc_to_page(const void *vmalloc_addr) { - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); + return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); } -EXPORT_SYMBOL(vmalloc_to_pfn); +EXPORT_SYMBOL(vmalloc_to_page); /*** Global kva allocator ***/ -- cgit v1.2.1 From aec6a8889a98a0cd58357cd0937a25189908f191 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:49:13 -0800 Subject: mm, show_mem: remove SHOW_MEM_FILTER_PAGE_COUNT Commit 4b59e6c47309 ("mm, show_mem: suppress page counts in non-blockable contexts") introduced SHOW_MEM_FILTER_PAGE_COUNT to suppress PFN walks on large memory machines. Commit c78e93630d15 ("mm: do not walk all of system memory during show_mem") avoided a PFN walk in the generic show_mem helper which removes the requirement for SHOW_MEM_FILTER_PAGE_COUNT in that case. This patch removes PFN walkers from the arch-specific implementations that report on a per-node or per-zone granularity. ARM and unicore32 still do a PFN walk as they report memory usage on each bank which is a much finer granularity where the debugging information may still be of use. As the remaining arches doing PFN walks have relatively small amounts of memory, this patch simply removes SHOW_MEM_FILTER_PAGE_COUNT. [akpm@linux-foundation.org: fix parisc] Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Tony Luck Cc: Russell King Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d81f4429ca..ec4417cb458a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2071,13 +2071,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) debug_guardpage_minorder() > 0) return; - /* - * Walking all memory to count page types is very expensive and should - * be inhibited in non-blockable contexts. - */ - if (!(gfp_mask & __GFP_WAIT)) - filter |= SHOW_MEM_FILTER_PAGE_COUNT; - /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set -- cgit v1.2.1 From 49f0ce5f92321cdcf741e35f385669a421013cb7 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 21 Jan 2014 15:49:14 -0800 Subject: mm: add overcommit_kbytes sysctl variable Some applications that run on HPC clusters are designed around the availability of RAM and the overcommit ratio is fine tuned to get the maximum usage of memory without swapping. With growing memory, the 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse for these workload (on a 2TB machine it represents no less than 20GB). This patch adds the new overcommit_kbytes sysctl variable that allow a much finer grain. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Jerome Marchand Cc: Dave Hansen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 + mm/nommu.c | 1 + mm/util.c | 36 ++++++++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 834b2d785f1e..39552de6e1db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ diff --git a/mm/util.c b/mm/util.c index 808f375648e7..a24aa22f2473 100644 --- a/mm/util.c +++ b/mm/util.c @@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) return mapping; } +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { - return ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; } -- cgit v1.2.1 From 363ee17f0f405faff74d9aaf93d21d5f41d5102d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Jan 2014 15:49:15 -0800 Subject: mm/mmap.c: add mlock_future_check() helper Both do_brk and do_mmap_pgoff verify that we are actually capable of locking future pages if the corresponding VM_LOCKED flags are used. Encapsulate this logic into a single mlock_future_check() helper function. Signed-off-by: Davidlohr Bueso Cc: Rik van Riel Reviewed-by: Michel Lespinasse Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 39552de6e1db..a0e7153a79e6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1191,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } +static inline int mlock_future_check(struct mm_struct *mm, + unsigned long flags, + unsigned long len) +{ + unsigned long locked, lock_limit; + + /* mlock MCL_FUTURE? */ + if (flags & VM_LOCKED) { + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + return 0; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ @@ -1252,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - /* mlock MCL_FUTURE? */ - if (vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + if (mlock_future_check(mm, vm_flags, len)) + return -EAGAIN; if (file) { struct inode *inode = file_inode(file); @@ -2592,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (error & ~PAGE_MASK) return error; - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + error = mlock_future_check(mm, mm->def_flags, len); + if (error) + return error; /* * mm->mmap_sem is required to protect against another thread -- cgit v1.2.1 From 1f1cd7054fe7f45e65dd4963d0a38e5ab7a57cae Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Jan 2014 15:49:16 -0800 Subject: mm/mlock: prepare params outside critical region All mlock related syscalls prepare lock limits, lengths and start parameters with the mmap_sem held. Move this logic outside of the critical region. For the case of mlock, continue incrementing the amount already locked by mm->locked_vm with the rwsem taken. Signed-off-by: Davidlohr Bueso Cc: Rik van Riel Reviewed-by: Michel Lespinasse Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..10819ed4df3e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; - locked = len >> PAGE_SHIFT; - locked += current->mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked += current->mm->locked_vm; /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); if (!error) error = __mm_populate(start, len, 0); @@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; + + down_write(¤t->mm->mmap_sem); ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); + return ret; } @@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; + down_write(¤t->mm->mmap_sem); + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); -- cgit v1.2.1 From 931d13f534a9bb39539f0a851209ca18013ba0c2 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:49:17 -0800 Subject: mm/memblock: debug: correct displaying of upper memory boundary Current memblock APIs don't work on 32 PAE or LPAE extension arches where the physical memory start address beyond 4GB. The problem was discussed here [3] where Tejun, Yinghai(thanks) proposed a way forward with memblock interfaces. Based on the proposal, this series adds necessary memblock interfaces and convert the core kernel code to use them. Architectures already converted to NO_BOOTMEM use these new interfaces and other which still uses bootmem, these new interfaces just fallback to exiting bootmem APIs. So no functional change in behavior. In long run, once all the architectures moves to NO_BOOTMEM, we can get rid of bootmem layer completely. This is one step to remove the core code dependency with bootmem and also gives path for architectures to move away from bootmem. Testing is done on ARM architecture with 32 bit ARM LAPE machines with normal as well sparse(faked) memory model. This patch (of 23): When debugging is enabled (cmdline has "memblock=debug") the memblock will display upper memory boundary per each allocated/freed memory range wrongly. For example: memblock_reserve: [0x0000009e7e8000-0x0000009e7ed000] _memblock_early_alloc_try_nid_nopanic+0xfc/0x12c The 0x0000009e7ed000 is displayed instead of 0x0000009e7ecfff Hence, correct this by changing formula used to calculate upper memory boundary to (u64)base + size - 1 instead of (u64)base + size everywhere in the debug messages. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Acked-by: Tejun Heo Cc: H. Peter Anvin Cc: Russell King Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 53e477bb5558..aab566998b61 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -643,7 +643,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", (unsigned long long)base, - (unsigned long long)base + size, + (unsigned long long)base + size - 1, (void *)_RET_IP_); return __memblock_remove(&memblock.reserved, base, size); @@ -655,7 +655,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", (unsigned long long)base, - (unsigned long long)base + size, + (unsigned long long)base + size - 1, (void *)_RET_IP_); return memblock_add_region(_rgn, base, size, MAX_NUMNODES); -- cgit v1.2.1 From 66a20757214d94b915f2d2aada1384dead9ab18d Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:20 -0800 Subject: memblock, numa: introduce flags field into memblock There is no flag in memblock to describe what type the memory is. Sometimes, we may use memblock to reserve some memory for special usage. And we want to know what kind of memory it is. So we need a way to In hotplug environment, we want to reserve hotpluggable memory so the kernel won't be able to use it. And when the system is up, we have to free these hotpluggable memory to buddy. So we need to mark these memory first. In order to do so, we need to mark out these special memory in memblock. In this patch, we introduce a new "flags" member into memblock_region: struct memblock_region { phys_addr_t base; phys_addr_t size; unsigned long flags; /* This is new. */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int nid; #endif }; This patch does the following things: 1) Add "flags" member to memblock_region. 2) Modify the following APIs' prototype: memblock_add_region() memblock_insert_region() 3) Add memblock_reserve_region() to support reserve memory with flags, and keep memblock_reserve()'s prototype unmodified. 4) Modify other APIs to support flags, but keep their prototype unmodified. The idea is from Wen Congyang and Liu Jiang . Suggested-by: Wen Congyang Suggested-by: Liu Jiang Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index aab566998b61..270b005ca964 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -255,6 +255,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + type->regions[0].flags = 0; memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } @@ -405,7 +406,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) if (this->base + this->size != next->base || memblock_get_region_node(this) != - memblock_get_region_node(next)) { + memblock_get_region_node(next) || + this->flags != next->flags) { BUG_ON(this->base + this->size > next->base); i++; continue; @@ -425,13 +427,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @base: base address of the new region * @size: size of the new region * @nid: node id of the new region + * @flags: flags of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accomodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, - phys_addr_t size, int nid) + phys_addr_t size, + int nid, unsigned long flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -439,6 +443,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); rgn->base = base; rgn->size = size; + rgn->flags = flags; memblock_set_region_node(rgn, nid); type->cnt++; type->total_size += size; @@ -450,6 +455,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base,@base+@size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already @@ -460,7 +466,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * 0 on success, -errno on failure. */ static int __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, int nid) + phys_addr_t base, phys_addr_t size, + int nid, unsigned long flags) { bool insert = false; phys_addr_t obase = base; @@ -475,6 +482,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; return 0; @@ -505,7 +513,8 @@ repeat: nr_new++; if (insert) memblock_insert_region(type, i++, base, - rbase - base, nid); + rbase - base, nid, + flags); } /* area below @rend is dealt with, forget about it */ base = min(rend, end); @@ -515,7 +524,8 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, nid); + memblock_insert_region(type, i, base, end - base, + nid, flags); } /* @@ -537,12 +547,13 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid); + return memblock_add_region(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); + return memblock_add_region(&memblock.memory, base, size, + MAX_NUMNODES, 0); } /** @@ -597,7 +608,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= base - rbase; type->total_size -= base - rbase; memblock_insert_region(type, i, rbase, base - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else if (rend > end) { /* * @rgn intersects from above. Split and redo the @@ -607,7 +619,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= end - rbase; type->total_size -= end - rbase; memblock_insert_region(type, i--, rbase, end - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) @@ -649,16 +662,24 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) return __memblock_remove(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_reserve_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) { struct memblock_type *_rgn = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - (void *)_RET_IP_); + flags, (void *)_RET_IP_); + + return memblock_add_region(_rgn, base, size, nid, flags); +} - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); } /** @@ -1101,6 +1122,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; + unsigned long flags; int i; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); @@ -1111,13 +1133,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", - name, i, base, base + size - 1, size, nid_buf); + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", + name, i, base, base + size - 1, size, nid_buf, flags); } } -- cgit v1.2.1 From 66b16edf9eafc3291cabb2253d0f342a847656b7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:23 -0800 Subject: memblock, mem_hotplug: introduce MEMBLOCK_HOTPLUG flag to mark hotpluggable regions In find_hotpluggable_memory, once we find out a memory region which is hotpluggable, we want to mark them in memblock.memory. So that we could control memblock allocator not to allocte hotpluggable memory for the kernel later. To achieve this goal, we introduce MEMBLOCK_HOTPLUG flag to indicate the hotpluggable memory regions in memblock and a function memblock_mark_hotplug() to mark hotpluggable memory if we find one. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 270b005ca964..2121ec4c7fa0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -682,6 +682,59 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) return memblock_reserve_region(base, size, MAX_NUMNODES, 0); } +/** + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and mark it with flag + * MEMBLOCK_HOTPLUG. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and clear flag + * MEMBLOCK_HOTPLUG for the isolated regions. + * + * Return 0 on succees, -errno on failure. + */ +int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_clear_region_flags(&type->regions[i], + MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable -- cgit v1.2.1 From e7e8de5918dd6a07cbddae559600d7765ad6a56e Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:26 -0800 Subject: memblock: make memblock_set_node() support different memblock_type [sfr@canb.auug.org.au: fix powerpc build] Signed-off-by: Tang Chen Reviewed-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 2121ec4c7fa0..d5681008dce1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -911,18 +911,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for * @size: size of area to set node ID for + * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. * Regions which cross the area boundaries are split as necessary. * * RETURNS: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, - int nid) + struct memblock_type *type, int nid) { - struct memblock_type *type = &memblock.memory; int start_rgn, end_rgn; int i, ret; -- cgit v1.2.1 From 55ac590c2fadad785d60dd70c12d62823bc2cd39 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:35 -0800 Subject: memblock, mem_hotplug: make memblock skip hotpluggable regions if needed Linux kernel cannot migrate pages used by the kernel. As a result, hotpluggable memory used by the kernel won't be able to be hot-removed. To solve this problem, the basic idea is to prevent memblock from allocating hotpluggable memory for the kernel at early time, and arrange all hotpluggable memory in ACPI SRAT(System Resource Affinity Table) as ZONE_MOVABLE when initializing zones. In the previous patches, we have marked hotpluggable memory regions with MEMBLOCK_HOTPLUG flag in memblock.memory. In this patch, we make memblock skip these hotpluggable memory regions in the default top-down allocation function if movable_node boot option is specified. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wanpeng Li Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 12 ++++++++++++ mm/memory_hotplug.c | 1 + 2 files changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index d5681008dce1..6a2a48a122a9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -39,6 +39,9 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; +#ifdef CONFIG_MOVABLE_NODE +bool movable_node_enabled __initdata_memblock = false; +#endif static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; @@ -820,6 +823,11 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). + * + * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't + * be able to hot-remove hotpluggable memory used by the kernel. So this + * function skip hotpluggable regions if needed when allocating memory for the + * kernel. */ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, @@ -844,6 +852,10 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + /* scan areas before each reservation for intersection */ for ( ; ri >= 0; ri--) { struct memblock_region *r = &rsv->regions[ri]; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..01e39afde1cb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1446,6 +1446,7 @@ static int __init cmdline_parse_movable_node(char *p) * the kernel away from hotpluggable memory. */ memblock_set_bottom_up(true); + movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); #endif -- cgit v1.2.1 From b2f3eebe7a8ef6cd4e2ea088ac7f613793f6cad6 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 21 Jan 2014 15:49:38 -0800 Subject: x86, numa, acpi, memory-hotplug: make movable_node have higher priority If users specify the original movablecore=nn@ss boot option, the kernel will arrange [ss, ss+nn) as ZONE_MOVABLE. The kernelcore=nn@ss boot option is similar except it specifies ZONE_NORMAL ranges. Now, if users specify "movable_node" in kernel commandline, the kernel will arrange hotpluggable memory in SRAT as ZONE_MOVABLE. And if users do this, all the other movablecore=nn@ss and kernelcore=nn@ss options should be ignored. For those who don't want this, just specify nothing. The kernel will act as before. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Reviewed-by: Wanpeng Li Cc: "H. Peter Anvin" Cc: "Rafael J . Wysocki" Cc: Chen Tang Cc: Gong Chen Cc: Ingo Molnar Cc: Jiang Liu Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Larry Woodman Cc: Len Brown Cc: Liu Jiang Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Prarit Bhargava Cc: Rik van Riel Cc: Taku Izumi Cc: Tejun Heo Cc: Thomas Gleixner Cc: Thomas Renninger Cc: Toshi Kani Cc: Vasilis Liaskovitis Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec4417cb458a..4f59d1986018 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5018,9 +5018,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); + struct memblock_type *type = &memblock.memory; + + /* Need to find movable_zone earlier when movable_node is specified. */ + find_usable_zone_for_movable(); + + /* + * If movable_node is specified, ignore kernelcore and movablecore + * options. + */ + if (movable_node_is_enabled()) { + for (i = 0; i < type->cnt; i++) { + if (!memblock_is_hotpluggable(&type->regions[i])) + continue; + + nid = type->regions[i].nid; + + usable_startpfn = PFN_DOWN(type->regions[i].base); + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + goto out2; + } /* - * If movablecore was specified, calculate what size of + * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -5046,7 +5070,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ - find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: @@ -5137,6 +5160,7 @@ restart: if (usable_nodes && required_kernelcore > usable_nodes) goto restart; +out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = -- cgit v1.2.1 From 1c98dd905ddb7552f13a3f06aa0bd9ef6affeeb7 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 21 Jan 2014 15:49:41 -0800 Subject: memcg: fix kmem_account_flags check in memcg_can_account_kmem() We should start kmem accounting for a memory cgroup only after both its kmem limit is set (KMEM_ACCOUNTED_ACTIVE) and related call sites are patched (KMEM_ACCOUNTED_ACTIVATED). Currently memcg_can_account_kmem() allows kmem accounting even if only one of the conditions is true. Fix it. This means that a page might get charged by memcg_kmem_newpage_charge which would see its static key patched already but memcg_kmem_commit_charge would still see it unpatched and so the charge won't be committed. The result would be charge inconsistency (page_cgroup not marked as PageCgroupUsed) and the charge would leak because __memcg_kmem_uncharge_pages would ignore it. [mhocko@suse.cz: augment changelog] Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f1a356153c0..3065fa80251d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2959,7 +2959,8 @@ static DEFINE_MUTEX(set_limit_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) { return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && - (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == + KMEM_ACCOUNTED_MASK; } /* -- cgit v1.2.1 From 2753b35bcd3156727138cd2305b825611a571a3f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 21 Jan 2014 15:49:42 -0800 Subject: memcg: make memcg_update_cache_sizes() static This function is not used outside of memcontrol.c so make it static. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3065fa80251d..08541f680d90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3087,7 +3087,7 @@ int memcg_cache_id(struct mem_cgroup *memcg) * But when we create a new cache, we can call this as well if its parent * is kmem-limited. That will have to hold set_limit_mutex as well. */ -int memcg_update_cache_sizes(struct mem_cgroup *memcg) +static int memcg_update_cache_sizes(struct mem_cgroup *memcg) { int num, ret; -- cgit v1.2.1 From b854f711f6b8b49674d494c5e6d706096dd38301 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:43 -0800 Subject: mm/rmap: recompute pgoff for huge page Rmap traversing is used in five different cases, try_to_unmap(), try_to_munlock(), page_referenced(), page_mkclean() and remove_migration_ptes(). Each one implements its own traversing functions for the cases, anon, file, ksm, respectively. These cause lots of duplications and cause maintenance overhead. They also make codes being hard to understand and error-prone. One example is hugepage handling. There is a code to compute hugepage offset correctly in try_to_unmap_file(), but, there isn't a code to compute hugepage offset in rmap_walk_file(). These are used pairwise in migration context, but we missed to modify pairwise. To overcome these drawbacks, we should unify these through one unified function. I decide rmap_walk() as main function since it has no unnecessity. And to control behavior of rmap_walk(), I introduce struct rmap_walk_control having some function pointers. These makes rmap_walk() working for their specific needs. This patchset remove a lot of duplicated code as you can see in below short-stat and kernel text size also decrease slightly. text data bss dec hex filename 10640 1 16 10657 29a1 mm/rmap.o 10047 1 16 10064 2750 mm/rmap.o 13823 705 8288 22816 5920 mm/ksm.o 13199 705 8288 22192 56b0 mm/ksm.o This patch (of 9): We have to recompute pgoff if the given page is huge, since result based on HPAGE_SIZE is not approapriate for scanning the vma interval tree, as shown by commit 36e4f20af833 ("hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach") and commit 369a713e ("rmap: recompute pgoff for unmapping huge page"). To handle both the cases, normal page for page cache and hugetlb page, by same way, we can use compound_page(). It returns 0 on non-compound page and it also returns proper value on compound page. Signed-off-by: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 068522d8502a..edc0aea2c4e3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1512,7 +1512,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; @@ -1520,9 +1520,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - if (PageHuge(page)) - pgoff = page->index << compound_order(page); - mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1712,7 +1709,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; -- cgit v1.2.1 From 0f843c6ac318bb3ea7b63437b66dd39d8f01b088 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:45 -0800 Subject: mm/rmap: factor nonlinear handling out of try_to_unmap_file() To merge all kinds of rmap traverse functions, try_to_unmap(), try_to_munlock(), page_referenced() and page_mkclean(), we need to extract common parts and separate out non-common parts. Nonlinear handling is handled just in try_to_unmap_file() and other rmap traverse functions doesn't care of it. Therfore it is better to factor nonlinear handling out of try_to_unmap_file() in order to merge all kinds of rmap traverse functions easily. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 136 ++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 74 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index edc0aea2c4e3..7eab4ed304c1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1426,6 +1426,79 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } +static int try_to_unmap_nonlinear(struct page *page, + struct address_space *mapping, struct vm_area_struct *vma) +{ + int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; + unsigned int mapcount; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; + } + + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ + return SWAP_FAIL; + } + + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? + */ + mapcount = page_mapcount(page); + if (!mapcount) + return ret; + + cond_resched(); + + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + cursor = (unsigned long) vma->vm_private_data; + while (cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + if (try_to_unmap_cluster(cursor, &mapcount, + vma, page) == SWAP_MLOCK) + ret = SWAP_MLOCK; + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if ((int)mapcount <= 0) + return ret; + } + vma->vm_private_data = (void *) max_nl_cursor; + } + cond_resched(); + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); + + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) + vma->vm_private_data = NULL; + + return ret; +} + bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1515,10 +1588,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1539,64 +1608,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.nonlinear) { - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - ret = SWAP_FAIL; - goto out; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? - */ - mapcount = page_mapcount(page); - if (!mapcount) - goto out; - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.nonlinear) { - cursor = (unsigned long) vma->vm_private_data; - while ( cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - goto out; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; + ret = try_to_unmap_nonlinear(page, mapping, vma); out: mutex_unlock(&mapping->i_mmap_mutex); return ret; -- cgit v1.2.1 From faecd8dd852d4e4a63a1b8ad43e5df8e41ee0336 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:46 -0800 Subject: mm/rmap: factor lock function out of rmap_walk_anon() When we traverse anon_vma, we need to take a read-side anon_lock. But there is subtle difference in the situation so that we can't use same method to take a lock in each cases. Therefore, we need to make rmap_walk_anon() taking difference lock function. This patch is the first step, factoring lock function for anon_lock out of rmap_walk_anon(). It will be used in case of removing migration entry and in default of rmap_walk_anon(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7eab4ed304c1..5a79bf585e27 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1684,6 +1684,24 @@ void __put_anon_vma(struct anon_vma *anon_vma) } #ifdef CONFIG_MIGRATION +static struct anon_vma *rmap_walk_anon_lock(struct page *page) +{ + struct anon_vma *anon_vma; + + /* + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() + * because that depends on page_mapped(); but not all its usages + * are holding mmap_sem. Users without mmap_sem are required to + * take a reference count to prevent the anon_vma disappearing + */ + anon_vma = page_anon_vma(page); + if (!anon_vma) + return NULL; + + anon_vma_lock_read(anon_vma); + return anon_vma; +} + /* * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): * Called by migrate.c to remove migration ptes, but might be used more later. @@ -1696,16 +1714,10 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() - * because that depends on page_mapped(); but not all its usages - * are holding mmap_sem. Users without mmap_sem are required to - * take a reference count to prevent the anon_vma disappearing - */ - anon_vma = page_anon_vma(page); + anon_vma = rmap_walk_anon_lock(page); if (!anon_vma) return ret; - anon_vma_lock_read(anon_vma); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); -- cgit v1.2.1 From 051ac83adf69eea4f57a97356e4282e395a5fa6d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:48 -0800 Subject: mm/rmap: make rmap_walk to get the rmap_walk_control argument In each rmap traverse case, there is some difference so that we need function pointers and arguments to them in order to handle these For this purpose, struct rmap_walk_control is introduced in this patch, and will be extended in following patch. Introducing and extending are separate, because it clarify changes. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 6 +++--- mm/migrate.c | 7 ++++++- mm/rmap.c | 19 ++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 175fff79dc95..c3035fee8080 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1997,8 +1997,7 @@ out: } #ifdef CONFIG_MIGRATION -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; @@ -2033,7 +2032,8 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; - ret = rmap_one(page, vma, rmap_item->address, arg); + ret = rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..11d89dc0574c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -199,7 +199,12 @@ out: */ static void remove_migration_ptes(struct page *old, struct page *new) { - rmap_walk(new, remove_migration_pte, old); + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = old, + }; + + rmap_walk(new, &rwc); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 5a79bf585e27..f8f10ad5d359 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1706,8 +1706,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page) * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): * Called by migrate.c to remove migration ptes, but might be used more later. */ -static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1721,7 +1720,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; } @@ -1729,8 +1728,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, return ret; } -static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << compound_order(page); @@ -1742,7 +1740,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = rmap_one(page, vma, address, arg); + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; } @@ -1755,17 +1753,16 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, return ret; } -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { VM_BUG_ON(!PageLocked(page)); if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rmap_one, arg); + return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rmap_one, arg); + return rmap_walk_anon(page, rwc); else - return rmap_walk_file(page, rmap_one, arg); + return rmap_walk_file(page, rwc); } #endif /* CONFIG_MIGRATION */ -- cgit v1.2.1 From 0dd1c7bbce8d1d142bb25aefaa50262dfd77cb78 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:49 -0800 Subject: mm/rmap: extend rmap_walk_xxx() to cope with different cases There are a lot of common parts in traversing functions, but there are also a little of uncommon parts in it. By assigning proper function pointer on each rmap_walker_control, we can handle these difference correctly. Following are differences we should handle. 1. difference of lock function in anon mapping case 2. nonlinear handling in file mapping case 3. prechecked condition: checking memcg in page_referenced(), checking VM_SHARE in page_mkclean() checking temporary vma in try_to_unmap() 4. exit condition: checking page_mapped() in try_to_unmap() So, in this patch, I introduce 4 function pointers to handle above differences. Signed-off-by: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 +++++++ mm/rmap.c | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index c3035fee8080..91b8cb35f7cc 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2032,12 +2032,19 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; } + if (rwc->done && rwc->done(page)) { + anon_vma_unlock_read(anon_vma); + goto out; + } } anon_vma_unlock_read(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index f8f10ad5d359..97bf8f0396f8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1684,10 +1684,14 @@ void __put_anon_vma(struct anon_vma *anon_vma) } #ifdef CONFIG_MIGRATION -static struct anon_vma *rmap_walk_anon_lock(struct page *page) +static struct anon_vma *rmap_walk_anon_lock(struct page *page, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; + if (rwc->anon_lock) + return rwc->anon_lock(page); + /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages @@ -1713,16 +1717,22 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page); + anon_vma = rmap_walk_anon_lock(page, rwc); if (!anon_vma) return ret; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; + if (rwc->done && rwc->done(page)) + break; } anon_vma_unlock_read(anon_vma); return ret; @@ -1740,15 +1750,26 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) - break; + goto done; + if (rwc->done && rwc->done(page)) + goto done; } - /* - * No nonlinear handling: being always shared, nonlinear vmas - * never contain migration ptes. Decide what to do about this - * limitation to linear when we need rmap_walk() on nonlinear. - */ + + if (!rwc->file_nonlinear) + goto done; + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto done; + + ret = rwc->file_nonlinear(page, mapping, vma); + +done: mutex_unlock(&mapping->i_mmap_mutex); return ret; } -- cgit v1.2.1 From 52629506420ce32997f1fba0a1ab2f1aaa9a4f79 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:50 -0800 Subject: mm/rmap: use rmap_walk() in try_to_unmap() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in try_to_unmap(). In this patch, I change following things. 1. enable rmap_walk() if !CONFIG_MIGRATION. 2. mechanical change to use rmap_walk() in try_to_unmap(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++-- mm/rmap.c | 48 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 91b8cb35f7cc..6b4baa97f4c0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1982,7 +1982,7 @@ again: continue; ret = try_to_unmap_one(page, vma, - rmap_item->address, flags); + rmap_item->address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) { anon_vma_unlock_read(anon_vma); goto out; @@ -1996,7 +1996,6 @@ out: return ret; } -#ifdef CONFIG_MIGRATION int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; @@ -2054,6 +2053,7 @@ out: return ret; } +#ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; diff --git a/mm/rmap.c b/mm/rmap.c index 97bf8f0396f8..b3263cb32361 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1179,15 +1179,18 @@ out: /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. + * + * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, enum ttu_flags flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; + enum ttu_flags flags = (enum ttu_flags)arg; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1513,6 +1516,11 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) return false; } +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return is_vma_temporary_stack(vma); +} + /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method @@ -1558,7 +1566,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); + ret = try_to_unmap_one(page, vma, address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; } @@ -1592,7 +1600,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, flags); + ret = try_to_unmap_one(page, vma, address, (void *)flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; } @@ -1614,6 +1622,11 @@ out: return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped @@ -1631,16 +1644,29 @@ out: int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .file_nonlinear = try_to_unmap_nonlinear, + .anon_lock = page_lock_anon_vma_read, + }; - BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); - if (unlikely(PageKsm(page))) - ret = try_to_unmap_ksm(page, flags); - else if (PageAnon(page)) - ret = try_to_unmap_anon(page, flags); - else - ret = try_to_unmap_file(page, flags); + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + rwc.invalid_vma = invalid_migration_vma; + + ret = rmap_walk(page, &rwc); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1683,7 +1709,6 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(anon_vma); } -#ifdef CONFIG_MIGRATION static struct anon_vma *rmap_walk_anon_lock(struct page *page, struct rmap_walk_control *rwc) { @@ -1785,7 +1810,6 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) else return rmap_walk_file(page, rwc); } -#endif /* CONFIG_MIGRATION */ #ifdef CONFIG_HUGETLB_PAGE /* -- cgit v1.2.1 From e8351ac9bfa7f4412d5d196b6742309473ca506d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:52 -0800 Subject: mm/rmap: use rmap_walk() in try_to_munlock() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in try_to_munlock(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> try_to_unmap_ksm, try_to_unmap_anon, try_to_unmap_file 2. mechanical change to use rmap_walk() in try_to_munlock(). 3. copy and paste comments. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 50 -------------------- mm/rmap.c | 154 +++++++++++++++++--------------------------------------------- 2 files changed, 42 insertions(+), 162 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 6b4baa97f4c0..646d45a6b6c8 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1946,56 +1946,6 @@ out: return referenced; } -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return SWAP_FAIL; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - ret = try_to_unmap_one(page, vma, - rmap_item->address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock_read(anon_vma); - goto out; - } - } - anon_vma_unlock_read(anon_vma); - } - if (!search_new_forks++) - goto again; -out: - return ret; -} - int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; diff --git a/mm/rmap.c b/mm/rmap.c index b3263cb32361..c73e0c645d09 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1177,9 +1177,6 @@ out: } /* - * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. - * * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, @@ -1521,107 +1518,6 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -/** - * try_to_unmap_anon - unmap or unlock anonymous page using the object-based - * rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * anonymous pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) -{ - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return ret; - - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address; - - /* - * During exec, a temporary VMA is setup and later moved. - * The VMA is moved under the anon_vma lock but not the - * page tables leading to a race where migration cannot - * find the migration ptes. Rather than increasing the - * locking requirements of exec(), migration skips - * temporary VMAs until after exec() completes. - */ - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && - is_vma_temporary_stack(vma)) - continue; - - address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - break; - } - - page_unlock_anon_vma_read(anon_vma); - return ret; -} - -/** - * try_to_unmap_file - unmap/unlock file page using the object-based rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the address_space struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * object-based pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_file(struct page *page, enum ttu_flags flags) -{ - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << compound_order(page); - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - ret = try_to_unmap_one(page, vma, address, (void *)flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - goto out; - } - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto out; - - /* - * We don't bother to try to find the munlocked page in nonlinears. - * It's costly. Instead, later, page reclaim logic may call - * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. - */ - if (TTU_ACTION(flags) == TTU_MUNLOCK) - goto out; - - ret = try_to_unmap_nonlinear(page, mapping, vma); -out: - mutex_unlock(&mapping->i_mmap_mutex); - return ret; -} - static int page_not_mapped(struct page *page) { return !page_mapped(page); @@ -1689,14 +1585,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) */ int try_to_munlock(struct page *page) { + int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)TTU_MUNLOCK, + .done = page_not_mapped, + /* + * We don't bother to try to find the munlocked page in + * nonlinears. It's costly. Instead, later, page reclaim logic + * may call try_to_unmap() and recover PG_mlocked lazily. + */ + .file_nonlinear = NULL, + .anon_lock = page_lock_anon_vma_read, + + }; + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (unlikely(PageKsm(page))) - return try_to_unmap_ksm(page, TTU_MUNLOCK); - else if (PageAnon(page)) - return try_to_unmap_anon(page, TTU_MUNLOCK); - else - return try_to_unmap_file(page, TTU_MUNLOCK); + ret = rmap_walk(page, &rwc); + return ret; } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1732,8 +1639,18 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, } /* - * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): - * Called by migrate.c to remove migration ptes, but might be used more later. + * rmap_walk_anon - do something to anonymous page using the object-based + * rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. */ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { @@ -1763,6 +1680,19 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) return ret; } +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. + */ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; -- cgit v1.2.1 From 9f32624be943538983eb0f18b73a9052d1493c80 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:53 -0800 Subject: mm/rmap: use rmap_walk() in page_referenced() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in page_referenced(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> page_referenced_ksm, page_referenced_anon, page_referenced_file 2. introduce new struct page_referenced_arg and pass it to page_referenced_one(), main function of rmap_walk, in order to count reference, to store vm_flags and to check finish condition. 3. mechanical change to use rmap_walk() in page_referenced(). [liwanp@linux.vnet.ibm.com: fix BUG at rmap_walk] Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Wanpeng Li Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 60 ++---------------- mm/rmap.c | 210 ++++++++++++++++++++++---------------------------------------- 2 files changed, 79 insertions(+), 191 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 646d45a6b6c8..3df141e5f3e0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1891,61 +1891,6 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - struct stable_node *stable_node; - struct rmap_item *rmap_item; - unsigned int mapcount = page_mapcount(page); - int referenced = 0; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return 0; -again: - hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - - referenced += page_referenced_one(page, vma, - rmap_item->address, &mapcount, vm_flags); - if (!search_new_forks || !mapcount) - break; - } - anon_vma_unlock_read(anon_vma); - if (!mapcount) - goto out; - } - if (!search_new_forks++) - goto again; -out: - return referenced; -} - int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; @@ -1954,6 +1899,11 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) int search_new_forks = 0; VM_BUG_ON(!PageKsm(page)); + + /* + * Rely on the page lock to protect against concurrent modifications + * to that page's node of the stable tree. + */ VM_BUG_ON(!PageLocked(page)); stable_node = page_stable_node(page); diff --git a/mm/rmap.c b/mm/rmap.c index c73e0c645d09..080413036406 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +struct page_referenced_arg { + int mapcount; + int referenced; + unsigned long vm_flags; + struct mem_cgroup *memcg; +}; /* - * Subfunctions of page_referenced: page_referenced_one called - * repeatedly from either page_referenced_anon or page_referenced_file. + * arg: page_referenced_arg will be passed */ int page_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, unsigned int *mapcount, - unsigned long *vm_flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; int referenced = 0; + struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; @@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); if (!pmd) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { spin_unlock(ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } /* go ahead even if the pmd is pmd_trans_splitting() */ @@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - (*mapcount)--; - - if (referenced) - *vm_flags |= vma->vm_flags; -out: - return referenced; -} - -static int page_referenced_anon(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - unsigned int mapcount; - struct anon_vma *anon_vma; - pgoff_t pgoff; - struct anon_vma_chain *avc; - int referenced = 0; - - anon_vma = page_lock_anon_vma_read(page); - if (!anon_vma) - return referenced; - - mapcount = page_mapcount(page); - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; + if (referenced) { + pra->referenced++; + pra->vm_flags |= vma->vm_flags; } - page_unlock_anon_vma_read(anon_vma); - return referenced; + pra->mapcount--; + if (!pra->mapcount) + return SWAP_SUCCESS; /* To break the loop */ + + return SWAP_AGAIN; } -/** - * page_referenced_file - referenced check for object-based rmap - * @page: the page we're checking references on. - * @memcg: target memory control group - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page - * - * For an object-based mapped page, find all the places it is mapped and - * check/clear the referenced flag. This is done by following the page->mapping - * pointer, then walking the chain of vmas it holds. It returns the number - * of references it found. - * - * This function is only called from page_referenced for object-based pages. - */ -static int page_referenced_file(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) +static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) { - unsigned int mapcount; - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int referenced = 0; - - /* - * The caller's checks on page->mapping and !PageAnon have made - * sure that this is a file page: the check for page->mapping - * excludes the case just before it gets set on an anon page. - */ - BUG_ON(PageAnon(page)); + struct page_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. - */ - BUG_ON(!PageLocked(page)); - - mutex_lock(&mapping->i_mmap_mutex); - - /* - * i_mmap_mutex does not stabilize mapcount at all, but mapcount - * is more likely to be accurate if we note it after spinning. - */ - mapcount = page_mapcount(page); - - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; - } + if (!mm_match_cgroup(vma->vm_mm, memcg)) + return true; - mutex_unlock(&mapping->i_mmap_mutex); - return referenced; + return false; } /** @@ -851,32 +768,47 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int referenced = 0; + int ret; int we_locked = 0; + struct page_referenced_arg pra = { + .mapcount = page_mapcount(page), + .memcg = memcg, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_referenced_one, + .arg = (void *)&pra, + .anon_lock = page_lock_anon_vma_read, + }; *vm_flags = 0; - if (page_mapped(page) && page_rmapping(page)) { - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { - we_locked = trylock_page(page); - if (!we_locked) { - referenced++; - goto out; - } - } - if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, memcg, - vm_flags); - else if (PageAnon(page)) - referenced += page_referenced_anon(page, memcg, - vm_flags); - else if (page->mapping) - referenced += page_referenced_file(page, memcg, - vm_flags); - if (we_locked) - unlock_page(page); + if (!page_mapped(page)) + return 0; + + if (!page_rmapping(page)) + return 0; + + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) + return 1; } -out: - return referenced; + + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (memcg) { + rwc.invalid_vma = invalid_page_referenced_vma; + } + + ret = rmap_walk(page, &rwc); + *vm_flags = pra.vm_flags; + + if (we_locked) + unlock_page(page); + + return pra.referenced; } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, @@ -1700,6 +1632,14 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma; int ret = SWAP_AGAIN; + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_mutex. + */ + VM_BUG_ON(!PageLocked(page)); + if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); @@ -1731,8 +1671,6 @@ done: int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { - VM_BUG_ON(!PageLocked(page)); - if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) -- cgit v1.2.1 From 9853a407b97d8d066b5a865173a4859a3e69fd8a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:49:55 -0800 Subject: mm/rmap: use rmap_walk() in page_mkclean() Now, we have an infrastructure in rmap_walk() to handle difference from variants of rmap traversing functions. So, just use it in page_mkclean(). In this patch, I change following things. 1. remove some variants of rmap traversing functions. cf> page_mkclean_file 2. mechanical change to use rmap_walk() in page_mkclean(). Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 080413036406..962e2a1e13a0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -812,12 +812,13 @@ int page_referenced(struct page *page, } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; spinlock_t *ptl; int ret = 0; + int *cleaned = arg; pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) @@ -836,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { mmu_notifier_invalidate_page(mm, address); + (*cleaned)++; + } out: - return ret; + return SWAP_AGAIN; } -static int page_mkclean_file(struct address_space *mapping, struct page *page) +static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - int ret = 0; - - BUG_ON(PageAnon(page)); + if (vma->vm_flags & VM_SHARED) + return 0; - mutex_lock(&mapping->i_mmap_mutex); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) { - unsigned long address = vma_address(page, vma); - ret += page_mkclean_one(page, vma, address); - } - } - mutex_unlock(&mapping->i_mmap_mutex); - return ret; + return 1; } int page_mkclean(struct page *page) { - int ret = 0; + int cleaned = 0; + struct address_space *mapping; + struct rmap_walk_control rwc = { + .arg = (void *)&cleaned, + .rmap_one = page_mkclean_one, + .invalid_vma = invalid_mkclean_vma, + }; BUG_ON(!PageLocked(page)); - if (page_mapped(page)) { - struct address_space *mapping = page_mapping(page); - if (mapping) - ret = page_mkclean_file(mapping, page); - } + if (!page_mapped(page)) + return 0; - return ret; + mapping = page_mapping(page); + if (!mapping) + return 0; + + rmap_walk(page, &rwc); + + return cleaned; } EXPORT_SYMBOL_GPL(page_mkclean); -- cgit v1.2.1 From 1da4db0cd5c8a31d4468ec906b413e75e604b465 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:58 -0800 Subject: oom_kill: change oom_kill.c to use for_each_thread() Change oom_kill.c to use for_each_thread() rather than the racy while_each_thread() which can loop forever if we race with exit. Note also that most users were buggy even if while_each_thread() was fine, the task can exit even _before_ rcu_read_lock(). Fortunately the new for_each_thread() only requires the stable task_struct, so this change fixes both problems. Signed-off-by: Oleg Nesterov Reviewed-by: Sergey Dyasly Tested-by: Sergey Dyasly Reviewed-by: Sameer Nanda Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Reviewed-by: Michal Hocko Cc: "Tu, Xiaobing" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..96d7945f75a6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -59,7 +59,7 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, { struct task_struct *start = tsk; - do { + for_each_thread(start, tsk) { if (mask) { /* * If this is a mempolicy constrained oom, tsk's @@ -77,7 +77,7 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, if (cpuset_mems_allowed_intersects(current, tsk)) return true; } - } while_each_thread(start, tsk); + } return false; } @@ -97,14 +97,14 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, */ struct task_struct *find_lock_task_mm(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; - do { + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) return t; task_unlock(t); - } while_each_thread(p, t); + } return NULL; } @@ -301,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long chosen_points = 0; rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { unsigned int points; switch (oom_scan_process_thread(p, totalpages, nodemask, @@ -323,7 +323,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; chosen_points = points; } - } while_each_thread(g, p); + } if (chosen) get_task_struct(chosen); rcu_read_unlock(); @@ -406,7 +406,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, { struct task_struct *victim = p; struct task_struct *child; - struct task_struct *t = p; + struct task_struct *t; struct mm_struct *mm; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -437,7 +437,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * still freeing memory. */ read_lock(&tasklist_lock); - do { + for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -455,7 +455,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, get_task_struct(victim); } } - } while_each_thread(p, t); + } read_unlock(&tasklist_lock); rcu_read_lock(); -- cgit v1.2.1 From ad96244179fbd55b40c00f10f399bc04739b8e1f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:50:00 -0800 Subject: oom_kill: has_intersects_mems_allowed() needs rcu_read_lock() At least out_of_memory() calls has_intersects_mems_allowed() without even rcu_read_lock(), this is obviously buggy. Add the necessary rcu_read_lock(). This means that we can not simply return from the loop, we need "bool ret" and "break". While at it, swap the names of task_struct's (the argument and the local). This cleans up the code a little bit and avoids the unnecessary initialization. Signed-off-by: Oleg Nesterov Reviewed-by: Sergey Dyasly Tested-by: Sergey Dyasly Reviewed-by: Sameer Nanda Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Reviewed-by: Michal Hocko Cc: "Tu, Xiaobing" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 96d7945f75a6..0d8ad1ebd1d1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -47,18 +47,20 @@ static DEFINE_SPINLOCK(zone_scan_lock); #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill - * @tsk: task struct of which task to consider + * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. */ -static bool has_intersects_mems_allowed(struct task_struct *tsk, +static bool has_intersects_mems_allowed(struct task_struct *start, const nodemask_t *mask) { - struct task_struct *start = tsk; + struct task_struct *tsk; + bool ret = false; + rcu_read_lock(); for_each_thread(start, tsk) { if (mask) { /* @@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, * mempolicy intersects current, otherwise it may be * needlessly killed. */ - if (mempolicy_nodemask_intersects(tsk, mask)) - return true; + ret = mempolicy_nodemask_intersects(tsk, mask); } else { /* * This is not a mempolicy constrained oom, so only * check the mems of tsk's cpuset. */ - if (cpuset_mems_allowed_intersects(current, tsk)) - return true; + ret = cpuset_mems_allowed_intersects(current, tsk); } + if (ret) + break; } + rcu_read_unlock(); - return false; + return ret; } #else static bool has_intersects_mems_allowed(struct task_struct *tsk, -- cgit v1.2.1 From 4d4048be8a93769350efa31d2482a038b7de73d0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:50:01 -0800 Subject: oom_kill: add rcu_read_lock() into find_lock_task_mm() find_lock_task_mm() expects it is called under rcu or tasklist lock, but it seems that at least oom_unkillable_task()->task_in_mem_cgroup() and mem_cgroup_out_of_memory()->oom_badness() can call it lockless. Perhaps we could fix the callers, but this patch simply adds rcu lock into find_lock_task_mm(). This also allows to simplify a bit one of its callers, oom_kill_process(). Signed-off-by: Oleg Nesterov Cc: Sergey Dyasly Cc: Sameer Nanda Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Reviewed-by: Michal Hocko Cc: "Tu, Xiaobing" Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0d8ad1ebd1d1..054ff47c4478 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -102,14 +102,19 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t; + rcu_read_lock(); + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - return t; + goto found; task_unlock(t); } + t = NULL; +found: + rcu_read_unlock(); - return NULL; + return t; } /* return true if the task is not adequate as candidate victim task. */ @@ -461,10 +466,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } read_unlock(&tasklist_lock); - rcu_read_lock(); p = find_lock_task_mm(victim); if (!p) { - rcu_read_unlock(); put_task_struct(victim); return; } else if (victim != p) { @@ -490,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * That thread will now get access to memory reserves since it has a * pending fatal signal. */ + rcu_read_lock(); for_each_process(p) if (p->mm == mm && !same_thread_group(p, victim) && !(p->flags & PF_KTHREAD)) { -- cgit v1.2.1 From fd615c4e671979e3e362df537d6be38f8d27aa80 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:05 -0800 Subject: mm/memblock: debug: don't free reserved array if !ARCH_DISCARD_MEMBLOCK Now the Nobootmem allocator will always try to free memory allocated for reserved memory regions (free_low_memory_core_early()) without taking into to account current memblock debugging configuration (CONFIG_ARCH_DISCARD_MEMBLOCK and CONFIG_DEBUG_FS state). As result if: - CONFIG_DEBUG_FS defined - CONFIG_ARCH_DISCARD_MEMBLOCK not defined; - reserved memory regions array have been resized during boot then: - memory allocated for reserved memory regions array will be freed to buddy allocator; - debug_fs entry "sys/kernel/debug/memblock/reserved" will show garbage instead of state of memory reservations. like: 0: 0x98393bc0..0x9a393bbf 1: 0xff120000..0xff11ffff 2: 0x00000000..0xffffffff Hence, do not free memory allocated for reserved memory regions if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK). Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Reviewed-by: Tejun Heo Cc: Yinghai Lu Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 6a2a48a122a9..de4d9c352fd6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -269,6 +269,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( if (memblock.reserved.regions == memblock_reserved_init_regions) return 0; + /* + * Don't allow nobootmem allocator to free reserved memory regions + * array if + * - CONFIG_DEBUG_FS is enabled; + * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled; + * - reserved memory regions array have been resized during boot. + * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved" + * will show garbage instead of state of memory reservations. + */ + if (IS_ENABLED(CONFIG_DEBUG_FS) && + !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + return 0; + *addr = __pa(memblock.reserved.regions); return PAGE_ALIGN(sizeof(struct memblock_region) * -- cgit v1.2.1 From 869a84e1ca163b737236dae997db4a6a1e230b9b Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:10 -0800 Subject: mm/memblock: remove unnecessary inclusions of bootmem.h Clean-up to remove depedency with bootmem headers. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Reviewed-by: Tejun Heo Cc: Yinghai Lu Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Christoph Lameter Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01e39afde1cb..af4935ee444f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.1 From 79f40fab0b3a78e0e41fac79a65a9870f4b05652 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:12 -0800 Subject: mm/memblock: drop WARN and use SMP_CACHE_BYTES as a default alignment Don't produce warning and interpret 0 as "default align" equal to SMP_CACHE_BYTES in case if caller of memblock_alloc_base_nid() doesn't specify alignment for the block (align == 0). This is done in preparation of introducing common memblock alloc interface to make code behavior consistent. More details are in below thread : https://lkml.org/lkml/2013/10/13/117. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index de4d9c352fd6..6aca54812db0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -969,8 +969,8 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; - if (WARN_ON(!align)) - align = __alignof__(long long); + if (!align) + align = SMP_CACHE_BYTES; /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); -- cgit v1.2.1 From 87029ee9390b2297dae699d5fb135b77992116e5 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:14 -0800 Subject: mm/memblock: reorder parameters of memblock_find_in_range_node Reorder parameters of memblock_find_in_range_node to be consistent with other memblock APIs. The change was suggested by Tejun Heo . Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 16 ++++++++-------- mm/nobootmem.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 6aca54812db0..a95d6dc066d5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -157,10 +157,10 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, /** * memblock_find_in_range_node - find free area in given range and node - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @nid: nid of the free area to find, %MAX_NUMNODES for any node * * Find @size free area aligned to @align in the specified range and node. @@ -176,9 +176,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * RETURNS: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { int ret; phys_addr_t kernel_end; @@ -241,8 +241,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(start, end, size, align, - MAX_NUMNODES); + return memblock_find_in_range_node(size, align, start, end, + MAX_NUMNODES); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -975,7 +975,7 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, /* align @size to avoid excessive fragmentation on reserved array */ size = round_up(size, align); - found = memblock_find_in_range_node(0, max_addr, size, align, nid); + found = memblock_find_in_range_node(size, align, 0, max_addr, nid); if (found && !memblock_reserve(found, size)) return found; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..59777e050d09 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(goal, limit, size, align, nid); + addr = memblock_find_in_range_node(size, align, goal, limit, nid); if (!addr) return NULL; -- cgit v1.2.1 From b115423357e0cda6d8f45d0c81df537d7b004020 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:16 -0800 Subject: mm/memblock: switch to use NUMA_NO_NODE instead of MAX_NUMNODES It's recommended to use NUMA_NO_NODE everywhere to select "process any node" behavior or to indicate that "no node id specified". Hence, update __next_free_mem_range*() API's to accept both NUMA_NO_NODE and MAX_NUMNODES, but emit warning once on MAX_NUMNODES, and correct corresponding API's documentation to describe new behavior. Also, update other memblock/nobootmem APIs where MAX_NUMNODES is used dirrectly. The change was suggested by Tejun Heo. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 28 +++++++++++++++++++--------- mm/nobootmem.c | 8 ++++---- 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index a95d6dc066d5..03f1dc7b663c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * @@ -126,7 +126,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -161,7 +161,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @align: alignment of free area to find * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * Find @size free area aligned to @align in the specified range and node. * @@ -242,7 +242,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { return memblock_find_in_range_node(size, align, start, end, - MAX_NUMNODES); + NUMA_NO_NODE); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -754,7 +754,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -782,6 +782,11 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; + bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); + + if (nid == MAX_NUMNODES) + pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + __func__); for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; @@ -789,7 +794,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (check_node && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -830,7 +835,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, /** * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: nid: node selector, %NUMA_NO_NODE for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL @@ -850,6 +855,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; + bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); + + if (nid == MAX_NUMNODES) + pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + __func__); if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; @@ -862,7 +872,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (check_node && nid != memblock_get_region_node(m)) continue; /* skip hotpluggable memory regions if needed */ @@ -989,7 +999,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 59777e050d09..19121ceb8874 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end, size; u64 i; - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); /* free range that is used for reserved array if we allocate it */ @@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ @@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, restart: - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; @@ -299,7 +299,7 @@ again: if (ptr) return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; -- cgit v1.2.1 From 26f09e9b3a0696f6fe20b021901300fba26fb579 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:19 -0800 Subject: mm/memblock: add memblock memory allocation apis Introduce memblock memory allocation APIs which allow to support PAE or LPAE extension on 32 bits archs where the physical memory start address can be beyond 4GB. In such cases, existing bootmem APIs which operate on 32 bit addresses won't work and needs memblock layer which operates on 64 bit addresses. So we add equivalent APIs so that we can replace usage of bootmem with memblock interfaces. Architectures already converted to NO_BOOTMEM use these new memblock interfaces. The architectures which are still not converted to NO_BOOTMEM continue to function as is because we still maintain the fal lback option of bootmem back-end supporting these new interfaces. So no functional change as such. In long run, once all the architectures moves to NO_BOOTMEM, we can get rid of bootmem layer completely. This is one step to remove the core code dependency with bootmem and also gives path for architectures to move away from bootmem. The proposed interface will became active if both CONFIG_HAVE_MEMBLOCK and CONFIG_NO_BOOTMEM are specified by arch. In case !CONFIG_NO_BOOTMEM, the memblock() wrappers will fallback to the existing bootmem apis so that arch's not converted to NO_BOOTMEM continue to work as is. The meaning of MEMBLOCK_ALLOC_ACCESSIBLE and MEMBLOCK_ALLOC_ANYWHERE is kept same. [akpm@linux-foundation.org: s/depricated/deprecated/] Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 207 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 03f1dc7b663c..018e55dc004d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -21,6 +21,9 @@ #include #include +#include + +#include "internal.h" static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -785,7 +788,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", __func__); for ( ; mi < mem->cnt; mi++) { @@ -858,7 +861,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is depricated. Use NUMA_NO_NODE instead\n", + pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", __func__); if (*idx == (u64)ULLONG_MAX) { @@ -1029,6 +1032,208 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +/** + * memblock_virt_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Also, allocation may fall back + * to any node in the system if the specified node can not + * hold the requested memory. + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. + * + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * + * The phys address of allocated boot memory block is converted to virtual and + * allocated memory is reset to 0. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc for + * allocated boot memory block, so that it is never reported as leaks. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +static void * __init memblock_virt_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + phys_addr_t alloc; + void *ptr; + + if (nid == MAX_NUMNODES) + pr_warn("%s: usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE\n", + __func__); + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of free_all_bootmem) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (!align) + align = SMP_CACHE_BYTES; + + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); + +again: + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, + nid); + if (alloc) + goto done; + + if (nid != NUMA_NO_NODE) { + alloc = memblock_find_in_range_node(size, align, min_addr, + max_addr, NUMA_NO_NODE); + if (alloc) + goto done; + } + + if (min_addr) { + min_addr = 0; + goto again; + } else { + goto error; + } + +done: + memblock_reserve(alloc, size); + ptr = phys_to_virt(alloc); + memset(ptr, 0, size); + + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. This is because many of these blocks + * are only referred via the physical address which is not + * looked up by kmemleak. + */ + kmemleak_alloc(ptr, size, 0, 0); + + return ptr; + +error: + return NULL; +} + +/** + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides + * additional debug information (including caller info), if enabled. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + return memblock_virt_alloc_internal(size, align, min_addr, + max_addr, nid); +} + +/** + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * which provides debug information (including caller info), if enabled, + * and panics if the request can not be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + return ptr; + + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr); + return NULL; +} + +/** + * __memblock_free_early - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + __memblock_remove(&memblock.reserved, base, size); +} + +/* + * __memblock_free_late - free bootmem block pages directly to buddy allocator + * @addr: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are released directly + * to the buddy allocator, no bootmem metadata is updated because it is gone. + */ +void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + u64 cursor, end; + + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} /* * Remaining API functions -- cgit v1.2.1 From 6782832eba5e8c87a749a41da8deda1c3ef67ba0 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:25 -0800 Subject: mm/page_alloc.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59d1986018..b230e838883d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4215,7 +4215,6 @@ static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; - struct pglist_data *pgdat = zone->zone_pgdat; size_t alloc_size; /* @@ -4231,7 +4230,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node_nopanic(pgdat, alloc_size); + memblock_virt_alloc_node_nopanic( + alloc_size, zone->zone_pgdat->node_id); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -4351,13 +4351,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) #endif /** - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * * If an architecture guarantees that all ranges registered with * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling free_bootmem() manually. + * this function may be used instead of calling memblock_free_early_nid() + * manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4369,9 +4370,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) end_pfn = min(end_pfn, max_low_pfn); if (start_pfn < end_pfn) - free_bootmem_node(NODE_DATA(this_nid), - PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT); + memblock_free_early_nid(PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT, + this_nid); } } @@ -4642,8 +4643,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, - usemapsize); + zone->pageblock_flags = + memblock_virt_alloc_node_nopanic(usemapsize, + pgdat->node_id); } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -4837,7 +4839,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node_nopanic(pgdat, size); + map = memblock_virt_alloc_node_nopanic(size, + pgdat->node_id); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -5887,7 +5890,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem_nopanic(size); + table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { -- cgit v1.2.1 From bb016b84164554725899aef544331085e08cb402 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:34 -0800 Subject: mm/sparse: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 6 ++++-- mm/sparse.c | 27 +++++++++++++++------------ 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); + return memblock_virt_alloc_try_nid(size, align, goal, + BOOTMEM_ALLOC_ACCESSIBLE, node); } static void *vmemmap_buf; @@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ - free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); + memblock_free_early(__pa(vmemmap_buf), + vmemmap_buf_end - vmemmap_buf); vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) else section = kzalloc(array_size, GFP_KERNEL); } else { - section = alloc_bootmem_node(NODE_DATA(nid), array_size); + section = memblock_virt_alloc_node(array_size, nid); } return section; @@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, - SMP_CACHE_BYTES, goal, limit); + p = memblock_virt_alloc_try_nid_nopanic(size, + SMP_CACHE_BYTES, goal, limit, + nid); if (!p && limit) { limit = 0; goto again; @@ -331,7 +332,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return alloc_bootmem_node_nopanic(pgdat, size); + return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) return map; size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); - map = __alloc_bootmem_node_high(NODE_DATA(nid), size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) @@ -545,7 +548,7 @@ void __init sparse_init(void) * sparse_early_mem_map_alloc, so allocate usemap_map at first. */ size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = alloc_bootmem(size); + usemap_map = memblock_virt_alloc(size, 0); if (!usemap_map) panic("can not allocate usemap_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, @@ -553,7 +556,7 @@ void __init sparse_init(void) #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = alloc_bootmem(size2); + map_map = memblock_virt_alloc(size2, 0); if (!map_map) panic("can not allocate map_map\n"); alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, @@ -583,9 +586,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - free_bootmem(__pa(map_map), size2); + memblock_free_early(__pa(map_map), size2); #endif - free_bootmem(__pa(usemap_map), size); + memblock_free_early(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.1 From 8b89a1169437541a2a9b62c8f7b1a5c0ceb0fbde Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:36 -0800 Subject: mm/hugetlb.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1697ff0cc53a..04306b9de90d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1271,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), - huge_page_size(h), huge_page_size(h), 0); - + addr = memblock_virt_alloc_try_nid_nopanic( + huge_page_size(h), huge_page_size(h), + 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the @@ -1313,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void) #ifdef CONFIG_HIGHMEM page = pfn_to_page(m->phys >> PAGE_SHIFT); - free_bootmem_late((unsigned long)m, - sizeof(struct huge_bootmem_page)); + memblock_free_late(__pa(m), + sizeof(struct huge_bootmem_page)); #else page = virt_to_page(m); #endif -- cgit v1.2.1 From 0d036e9e33df8befa9348683ba68258fee7f0a00 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:38 -0800 Subject: mm/page_cgroup.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6d757e3a872a..d8bd2c500aa4 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) table_size = sizeof(struct page_cgroup) * nr_pages; - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_cgroup = base; -- cgit v1.2.1 From 999c17e3de4855af4e829c0871ad32fc76a93991 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:40 -0800 Subject: mm/percpu.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 0d10defe951e..65fd8a749712 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); if (!ptr) return NULL; ai = ptr; @@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - free_bootmem(__pa(ai), ai->__ai_size); + memblock_free_early(__pa(ai), ai->__ai_size); } /** @@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); - group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); - unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + group_offsets = memblock_virt_alloc(ai->nr_groups * + sizeof(group_offsets[0]), 0); + group_sizes = memblock_virt_alloc(ai->nr_groups * + sizeof(group_sizes[0]), 0); + unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); + unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + pcpu_slot = memblock_virt_alloc( + pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * covers static area + reserved area (mostly used for module * static percpu allocation). */ - schunk = alloc_bootmem(pcpu_chunk_struct_size); + schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); schunk->base_addr = base_addr; schunk->map = smap; @@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(pcpu_chunk_struct_size); + dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); dchunk->base_addr = base_addr; dchunk->map = dmap; @@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = alloc_bootmem_nopanic(areas_size); + areas = memblock_virt_alloc_nopanic(areas_size, 0); if (!areas) { rc = -ENOMEM; goto out_free; @@ -1712,7 +1715,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - free_bootmem(__pa(areas), areas_size); + memblock_free_early(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = alloc_bootmem(pages_size); + pages = memblock_virt_alloc(pages_size, 0); /* allocate pages */ j = 0; @@ -1823,7 +1826,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - free_bootmem(__pa(pages), pages_size); + memblock_free_early(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); + return memblock_virt_alloc_from_nopanic( + size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - free_bootmem(__pa(ptr), size); + memblock_free_early(__pa(ptr), size); } void __init setup_per_cpu_areas(void) @@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + fc = memblock_virt_alloc_from_nopanic(unit_size, + PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ -- cgit v1.2.1 From 9e43aa2b8d1cb3137bd7e60d5fead83d0569de2b Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:43 -0800 Subject: mm/memory_hotplug.c: use memblock apis for early memory allocations Correct ensure_zone_is_initialized() function description according to the introduced memblock APIs for early memory allocations. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index af4935ee444f..cc2ab37220b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, } /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic() */ + * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { -- cgit v1.2.1 From 560dca27a6b36015e4f69a4ceba0ee5be0707c17 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 21 Jan 2014 15:50:55 -0800 Subject: mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter Check nid parameter and produce warning if it has deprecated MAX_NUMNODES value. Also re-assign NUMA_NO_NODE value to the nid parameter in this case. These will help to identify the wrong API usage (the caller) and make code simpler. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 018e55dc004d..1c2ef2c7edab 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -785,11 +785,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; - bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); - if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", - __func__); + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; @@ -797,7 +795,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (check_node && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -858,11 +856,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, struct memblock_type *rsv = &memblock.reserved; int mi = *idx & 0xffffffff; int ri = *idx >> 32; - bool check_node = (nid != NUMA_NO_NODE) && (nid != MAX_NUMNODES); - if (nid == MAX_NUMNODES) - pr_warn_once("%s: Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n", - __func__); + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; @@ -875,7 +871,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (check_node && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) continue; /* skip hotpluggable memory regions if needed */ @@ -1067,9 +1063,8 @@ static void * __init memblock_virt_alloc_internal( phys_addr_t alloc; void *ptr; - if (nid == MAX_NUMNODES) - pr_warn("%s: usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE\n", - __func__); + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; /* * Detect any accidental use of these APIs after slab is ready, as at -- cgit v1.2.1 From 4883e997b26ed857da8dae6a6e6aeb12830b978d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 Jan 2014 15:50:56 -0800 Subject: mm/hwpoison: add '#' to hwpoison_inject Add '#' to hwpoison_inject just as done in madvise_hwpoison. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Reviewed-by: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 4c84678371eb..95487c71cad5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; inject: - printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); } -- cgit v1.2.1 From 1c30e0177e4f41a11cb88b0f1f056ccebfe0fff4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:50:58 -0800 Subject: mm: numa: make NUMA-migrate related functions static numamigrate_update_ratelimit and numamigrate_isolate_page only have callers in mm/migrate.c. This patch makes them static. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 11d89dc0574c..41eba21f10ba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1599,7 +1599,8 @@ bool migrate_ratelimited(int node) } /* Returns true if the node is migrate rate-limited after the update */ -bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) +static bool numamigrate_update_ratelimit(pg_data_t *pgdat, + unsigned long nr_pages) { bool rate_limited = false; @@ -1623,7 +1624,7 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) return rate_limited; } -int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; -- cgit v1.2.1 From 1c5e9c27cbd966c7f0038698d5dcd5ada3574f47 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:50:59 -0800 Subject: mm: numa: limit scope of lock for NUMA migrate rate limiting NUMA migrate rate limiting protects a migration counter and window using a lock but in some cases this can be a contended lock. It is not critical that the number of pages be perfect, lost updates are acceptable. Reduce the importance of this lock. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 41eba21f10ba..4612bb2e3677 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1602,26 +1602,29 @@ bool migrate_ratelimited(int node) static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) { - bool rate_limited = false; - /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - spin_lock(&pgdat->numabalancing_migrate_lock); if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { + spin_lock(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); + spin_unlock(&pgdat->numabalancing_migrate_lock); } if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) - rate_limited = true; - else - pgdat->numabalancing_migrate_nr_pages += nr_pages; - spin_unlock(&pgdat->numabalancing_migrate_lock); - - return rate_limited; + return true; + + /* + * This is an unlocked non-atomic update so errors are possible. + * The consequences are failing to migrate when we potentiall should + * have which is not severe enough to warrant locking. If it is ever + * a problem, it can be converted to a per-cpu counter. + */ + pgdat->numabalancing_migrate_nr_pages += nr_pages; + return false; } static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) -- cgit v1.2.1 From af1839d722c986ffeaae1e70a6ef1c75ff38dcd5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:01 -0800 Subject: mm: numa: trace tasks that fail migration due to rate limiting A low local/remote numa hinting fault ratio is potentially explained by failed migrations. This patch adds a tracepoint that fires when migration fails due to migration rate limitation. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 4612bb2e3677..f9e16350d09c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1614,8 +1614,11 @@ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, msecs_to_jiffies(migrate_interval_millisecs); spin_unlock(&pgdat->numabalancing_migrate_lock); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { + trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, + nr_pages); return true; + } /* * This is an unlocked non-atomic update so errors are possible. -- cgit v1.2.1 From 64a9a34e22896dad430e21a28ad8cb00a756fefc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:02 -0800 Subject: mm: numa: do not automatically migrate KSM pages KSM pages can be shared between tasks that are not necessarily related to each other from a NUMA perspective. This patch causes those pages to be ignored by automatic NUMA balancing so they do not migrate and do not cause unrelated tasks to be grouped together. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..7332c1785744 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = *pte; page = vm_normal_page(vma, addr, oldpte); - if (page) { + if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { ptent = pte_mknuma(ptent); set_pte_at(mm, addr, pte, ptent); -- cgit v1.2.1 From 947b3dd1a84ba3fcb7163688fdc36671941786f4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Jan 2014 15:51:04 -0800 Subject: memcg, oom: lock mem_cgroup_print_oom_info mem_cgroup_print_oom_info uses a static buffer (memcg_name) to store the name of the cgroup. This is not safe as pointed out by David Rientjes because memcg oom is locked only for its hierarchy and nothing prevents another parallel hierarchy to trigger oom as well and overwrite the already in-use buffer. This patch introduces oom_info_lock hidden inside mem_cgroup_print_oom_info which is held throughout the function. It makes access to memcg_name safe and as a bonus it also prevents parallel memcg ooms to interleave their statistics which would make the printed data hard to analyze otherwise. Signed-off-by: Michal Hocko Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08541f680d90..57b16083f046 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1647,13 +1647,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; /* - * Need a buffer in BSS, can't rely on allocations. The code relies - * on the assumption that OOM is serialized for memory controller. - * If this assumption is broken, revisit this code. + * protects memcg_name and makes sure that parallel ooms do not + * interleave */ + static DEFINE_SPINLOCK(oom_info_lock); + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; static char memcg_name[PATH_MAX]; int ret; struct mem_cgroup *iter; @@ -1662,6 +1662,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!p) return; + spin_lock(&oom_info_lock); rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1730,6 +1731,7 @@ done: pr_cont("\n"); } + spin_unlock(&oom_info_lock); } /* -- cgit v1.2.1 From 0eb927c0ab789d3d7d69f68acb850f69d4e7c36f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:05 -0800 Subject: mm: compaction: trace compaction begin and end The broad goal of the series is to improve allocation success rates for huge pages through memory compaction, while trying not to increase the compaction overhead. The original objective was to reintroduce capturing of high-order pages freed by the compaction, before they are split by concurrent activity. However, several bugs and opportunities for simple improvements were found in the current implementation, mostly through extra tracepoints (which are however too ugly for now to be considered for sending). The patches mostly deal with two mechanisms that reduce compaction overhead, which is caching the progress of migrate and free scanners, and marking pageblocks where isolation failed to be skipped during further scans. Patch 1 (from mgorman) adds tracepoints that allow calculate time spent in compaction and potentially debug scanner pfn values. Patch 2 encapsulates the some functionality for handling deferred compactions for better maintainability, without a functional change type is not determined without being actually needed. Patch 3 fixes a bug where cached scanner pfn's are sometimes reset only after they have been read to initialize a compaction run. Patch 4 fixes a bug where scanners meeting is sometimes not properly detected and can lead to multiple compaction attempts quitting early without doing any work. Patch 5 improves the chances of sync compaction to process pageblocks that async compaction has skipped due to being !MIGRATE_MOVABLE. Patch 6 improves the chances of sync direct compaction to actually do anything when called after async compaction fails during allocation slowpath. The impact of patches were validated using mmtests's stress-highalloc benchmark with mmtests's stress-highalloc benchmark on a x86_64 machine with 4GB memory. Due to instability of the results (mostly related to the bugs fixed by patches 2 and 3), 10 iterations were performed, taking min,mean,max values for success rates and mean values for time and vmstat-based metrics. First, the default GFP_HIGHUSER_MOVABLE allocations were tested with the patches stacked on top of v3.13-rc2. Patch 2 is OK to serve as baseline due to no functional changes in 1 and 2. Comments below. stress-highalloc 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp Success 1 Min 9.00 ( 0.00%) 10.00 (-11.11%) 43.00 (-377.78%) 43.00 (-377.78%) 33.00 (-266.67%) Success 1 Mean 27.50 ( 0.00%) 25.30 ( 8.00%) 45.50 (-65.45%) 45.90 (-66.91%) 46.30 (-68.36%) Success 1 Max 36.00 ( 0.00%) 36.00 ( 0.00%) 47.00 (-30.56%) 48.00 (-33.33%) 52.00 (-44.44%) Success 2 Min 10.00 ( 0.00%) 8.00 ( 20.00%) 46.00 (-360.00%) 45.00 (-350.00%) 35.00 (-250.00%) Success 2 Mean 26.40 ( 0.00%) 23.50 ( 10.98%) 47.30 (-79.17%) 47.60 (-80.30%) 48.10 (-82.20%) Success 2 Max 34.00 ( 0.00%) 33.00 ( 2.94%) 48.00 (-41.18%) 50.00 (-47.06%) 54.00 (-58.82%) Success 3 Min 65.00 ( 0.00%) 63.00 ( 3.08%) 85.00 (-30.77%) 84.00 (-29.23%) 85.00 (-30.77%) Success 3 Mean 76.70 ( 0.00%) 70.50 ( 8.08%) 86.20 (-12.39%) 85.50 (-11.47%) 86.00 (-12.13%) Success 3 Max 87.00 ( 0.00%) 86.00 ( 1.15%) 88.00 ( -1.15%) 87.00 ( 0.00%) 87.00 ( 0.00%) 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp User 6437.72 6459.76 5960.32 5974.55 6019.67 System 1049.65 1049.09 1029.32 1031.47 1032.31 Elapsed 1856.77 1874.48 1949.97 1994.22 1983.15 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-nothp 3-nothp 4-nothp 5-nothp 6-nothp Minor Faults 253952267 254581900 250030122 250507333 250157829 Major Faults 420 407 506 530 530 Swap Ins 4 9 9 6 6 Swap Outs 398 375 345 346 333 Direct pages scanned 197538 189017 298574 287019 299063 Kswapd pages scanned 1809843 1801308 1846674 1873184 1861089 Kswapd pages reclaimed 1806972 1798684 1844219 1870509 1858622 Direct pages reclaimed 197227 188829 298380 286822 298835 Kswapd efficiency 99% 99% 99% 99% 99% Kswapd velocity 953.382 970.449 952.243 934.569 922.286 Direct efficiency 99% 99% 99% 99% 99% Direct velocity 104.058 101.832 153.961 143.200 148.205 Percentage direct scans 9% 9% 13% 13% 13% Zone normal velocity 347.289 359.676 348.063 339.933 332.983 Zone dma32 velocity 710.151 712.605 758.140 737.835 737.507 Zone dma velocity 0.000 0.000 0.000 0.000 0.000 Page writes by reclaim 557.600 429.000 353.600 426.400 381.800 Page writes file 159 53 7 79 48 Page writes anon 398 375 345 346 333 Page reclaim immediate 825 644 411 575 420 Sector Reads 2781750 2769780 2878547 2939128 2910483 Sector Writes 12080843 12083351 12012892 12002132 12010745 Page rescued immediate 0 0 0 0 0 Slabs scanned 1575654 1545344 1778406 1786700 1794073 Direct inode steals 9657 10037 15795 14104 14645 Kswapd inode steals 46857 46335 50543 50716 51796 Kswapd skipped wait 0 0 0 0 0 THP fault alloc 97 91 81 71 77 THP collapse alloc 456 506 546 544 565 THP splits 6 5 5 4 4 THP fault fallback 0 1 0 0 0 THP collapse fail 14 14 12 13 12 Compaction stalls 1006 980 1537 1536 1548 Compaction success 303 284 562 559 578 Compaction failures 702 696 974 976 969 Page migrate success 1177325 1070077 3927538 3781870 3877057 Page migrate failure 0 0 0 0 0 Compaction pages isolated 2547248 2306457 8301218 8008500 8200674 Compaction migrate scanned 42290478 38832618 153961130 154143900 159141197 Compaction free scanned 89199429 79189151 356529027 351943166 356326727 Compaction cost 1566 1426 5312 5156 5294 NUMA PTE updates 0 0 0 0 0 NUMA hint faults 0 0 0 0 0 NUMA hint local faults 0 0 0 0 0 NUMA hint local percent 100 100 100 100 100 NUMA pages migrated 0 0 0 0 0 AutoNUMA cost 0 0 0 0 0 Observations: - The "Success 3" line is allocation success rate with system idle (phases 1 and 2 are with background interference). I used to get stable values around 85% with vanilla 3.11. The lower min and mean values came with 3.12. This was bisected to commit 81c0a2bb ("mm: page_alloc: fair zone allocator policy") As explained in comment for patch 3, I don't think the commit is wrong, but that it makes the effect of compaction bugs worse. From patch 3 onwards, the results are OK and match the 3.11 results. - Patch 4 also clearly helps phases 1 and 2, and exceeds any results I've seen with 3.11 (I didn't measure it that thoroughly then, but it was never above 40%). - Compaction cost and number of scanned pages is higher, especially due to patch 4. However, keep in mind that patches 3 and 4 fix existing bugs in the current design of compaction overhead mitigation, they do not change it. If overhead is found unacceptable, then it should be decreased differently (and consistently, not due to random conditions) than the current implementation does. In contrast, patches 5 and 6 (which are not strictly bug fixes) do not increase the overhead (but also not success rates). This might be a limitation of the stress-highalloc benchmark as it's quite uniform. Another set of results is when configuring stress-highalloc t allocate with similar flags as THP uses: (GFP_HIGHUSER_MOVABLE|__GFP_NOMEMALLOC|__GFP_NORETRY|__GFP_NO_KSWAPD) stress-highalloc 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-thp 3-thp 4-thp 5-thp 6-thp Success 1 Min 2.00 ( 0.00%) 7.00 (-250.00%) 18.00 (-800.00%) 19.00 (-850.00%) 26.00 (-1200.00%) Success 1 Mean 19.20 ( 0.00%) 17.80 ( 7.29%) 29.20 (-52.08%) 29.90 (-55.73%) 32.80 (-70.83%) Success 1 Max 27.00 ( 0.00%) 29.00 ( -7.41%) 35.00 (-29.63%) 36.00 (-33.33%) 37.00 (-37.04%) Success 2 Min 3.00 ( 0.00%) 8.00 (-166.67%) 21.00 (-600.00%) 21.00 (-600.00%) 32.00 (-966.67%) Success 2 Mean 19.30 ( 0.00%) 17.90 ( 7.25%) 32.20 (-66.84%) 32.60 (-68.91%) 35.70 (-84.97%) Success 2 Max 27.00 ( 0.00%) 30.00 (-11.11%) 36.00 (-33.33%) 37.00 (-37.04%) 39.00 (-44.44%) Success 3 Min 62.00 ( 0.00%) 62.00 ( 0.00%) 85.00 (-37.10%) 75.00 (-20.97%) 64.00 ( -3.23%) Success 3 Mean 66.30 ( 0.00%) 65.50 ( 1.21%) 85.60 (-29.11%) 83.40 (-25.79%) 83.50 (-25.94%) Success 3 Max 70.00 ( 0.00%) 69.00 ( 1.43%) 87.00 (-24.29%) 86.00 (-22.86%) 87.00 (-24.29%) 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-thp 3-thp 4-thp 5-thp 6-thp User 6547.93 6475.85 6265.54 6289.46 6189.96 System 1053.42 1047.28 1043.23 1042.73 1038.73 Elapsed 1835.43 1821.96 1908.67 1912.74 1956.38 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 3.13-rc2 2-thp 3-thp 4-thp 5-thp 6-thp Minor Faults 256805673 253106328 253222299 249830289 251184418 Major Faults 395 375 423 434 448 Swap Ins 12 10 10 12 9 Swap Outs 530 537 487 455 415 Direct pages scanned 71859 86046 153244 152764 190713 Kswapd pages scanned 1900994 1870240 1898012 1892864 1880520 Kswapd pages reclaimed 1897814 1867428 1894939 1890125 1877924 Direct pages reclaimed 71766 85908 153167 152643 190600 Kswapd efficiency 99% 99% 99% 99% 99% Kswapd velocity 1029.000 1067.782 1000.091 991.049 951.218 Direct efficiency 99% 99% 99% 99% 99% Direct velocity 38.897 49.127 80.747 79.983 96.468 Percentage direct scans 3% 4% 7% 7% 9% Zone normal velocity 351.377 372.494 348.910 341.689 335.310 Zone dma32 velocity 716.520 744.414 731.928 729.343 712.377 Zone dma velocity 0.000 0.000 0.000 0.000 0.000 Page writes by reclaim 669.300 604.000 545.700 538.900 429.900 Page writes file 138 66 58 83 14 Page writes anon 530 537 487 455 415 Page reclaim immediate 806 655 772 548 517 Sector Reads 2711956 2703239 2811602 2818248 2839459 Sector Writes 12163238 12018662 12038248 11954736 11994892 Page rescued immediate 0 0 0 0 0 Slabs scanned 1385088 1388364 1507968 1513292 1558656 Direct inode steals 1739 2564 4622 5496 6007 Kswapd inode steals 47461 46406 47804 48013 48466 Kswapd skipped wait 0 0 0 0 0 THP fault alloc 110 82 84 69 70 THP collapse alloc 445 482 467 462 539 THP splits 6 5 4 5 3 THP fault fallback 3 0 0 0 0 THP collapse fail 15 14 14 14 13 Compaction stalls 659 685 1033 1073 1111 Compaction success 222 225 410 427 456 Compaction failures 436 460 622 646 655 Page migrate success 446594 439978 1085640 1095062 1131716 Page migrate failure 0 0 0 0 0 Compaction pages isolated 1029475 1013490 2453074 2482698 2565400 Compaction migrate scanned 9955461 11344259 24375202 27978356 30494204 Compaction free scanned 27715272 28544654 80150615 82898631 85756132 Compaction cost 552 555 1344 1379 1436 NUMA PTE updates 0 0 0 0 0 NUMA hint faults 0 0 0 0 0 NUMA hint local faults 0 0 0 0 0 NUMA hint local percent 100 100 100 100 100 NUMA pages migrated 0 0 0 0 0 AutoNUMA cost 0 0 0 0 0 There are some differences from the previous results for THP-like allocations: - Here, the bad result for unpatched kernel in phase 3 is much more consistent to be between 65-70% and not related to the "regression" in 3.12. Still there is the improvement from patch 4 onwards, which brings it on par with simple GFP_HIGHUSER_MOVABLE allocations. - Compaction costs have increased, but nowhere near as much as the non-THP case. Again, the patches should be worth the gained determininsm. - Patches 5 and 6 somewhat increase the number of migrate-scanned pages. This is most likely due to __GFP_NO_KSWAPD flag, which means the cached pfn's and pageblock skip bits are not reset by kswapd that often (at least in phase 3 where no concurrent activity would wake up kswapd) and the patches thus help the sync-after-async compaction. It doesn't however show that the sync compaction would help so much with success rates, which can be again seen as a limitation of the benchmark scenario. This patch (of 6): Add two tracepoints for compaction begin and end of a zone. Using this it is possible to calculate how much time a workload is spending within compaction and potentially debug problems related to cached pfns for scanning. In combination with the direct reclaim and slab trace points it should be possible to estimate most allocation-related overhead for a workload. Signed-off-by: Mel Gorman Signed-off-by: Vlastimil Babka Cc: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f58bcd016f43..a03995eddedb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -970,6 +970,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) __reset_isolation_suitable(zone); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { @@ -1015,6 +1017,8 @@ out: cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); + trace_mm_compaction_end(ret); + return ret; } -- cgit v1.2.1 From de6c60a6c115acaa721cfd499e028a413d1fcbf3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:07 -0800 Subject: mm: compaction: encapsulate defer reset logic Currently there are several functions to manipulate the deferred compaction state variables. The remaining case where the variables are touched directly is when a successful allocation occurs in direct compaction, or is expected to be successful in the future by kswapd. Here, the lowest order that is expected to fail is updated, and in the case of successful allocation, the deferred status and counter is reset completely. Create a new function compaction_defer_reset() to encapsulate this functionality and make it easier to understand the code. No functional change. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 ++++----- mm/page_alloc.c | 5 +---- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index a03995eddedb..927de97cab8d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1124,12 +1124,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) compact_zone(zone, cc); if (cc->order > 0) { - int ok = zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0); - if (ok && cc->order >= zone->compact_order_failed) - zone->compact_order_failed = cc->order + 1; + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); /* Currently async compaction is never deferred. */ - else if (!ok && cc->sync) + else if (cc->sync) defer_compaction(zone, cc->order); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b230e838883d..84da0e3bc886 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2235,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; - preferred_zone->compact_considered = 0; - preferred_zone->compact_defer_shift = 0; - if (order >= preferred_zone->compact_order_failed) - preferred_zone->compact_order_failed = order + 1; + compaction_defer_reset(preferred_zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } -- cgit v1.2.1 From d3132e4b83e6bd383c74d716f7281d7c3136089c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:08 -0800 Subject: mm: compaction: reset cached scanner pfn's before reading them Compaction caches pfn's for its migrate and free scanners to avoid scanning the whole zone each time. In compact_zone(), the cached values are read to set up initial values for the scanners. There are several situations when these cached pfn's are reset to the first and last pfn of the zone, respectively. One of these situations is when a compaction has been deferred for a zone and is now being restarted during a direct compaction, which is also done in compact_zone(). However, compact_zone() currently reads the cached pfn's *before* resetting them. This means the reset doesn't affect the compaction that performs it, and with good chance also subsequent compactions, as update_pageblock_skip() is likely to be called and update the cached pfn's to those being processed. Another chance for a successful reset is when a direct compaction detects that migration and free scanners meet (which has its own problems addressed by another patch) and sets update_pageblock_skip flag which kswapd uses to do the reset because it goes to sleep. This is clearly a bug that results in non-deterministic behavior, so this patch moves the cached pfn reset to be performed *before* the values are read. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 927de97cab8d..f4e2c166880b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -946,6 +946,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); + /* * Setup to move all movable pages to the end of the zone. Used cached * information on where the scanners should start but check that it @@ -962,14 +970,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn = cc->migrate_pfn; } - /* - * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. - */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) - __reset_isolation_suitable(zone); - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); migrate_prep_local(); -- cgit v1.2.1 From 7ed695e069c3cbea5e1fd08f84a04536da91f584 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:09 -0800 Subject: mm: compaction: detect when scanners meet in isolate_freepages Compaction of a zone is finished when the migrate scanner (which begins at the zone's lowest pfn) meets the free page scanner (which begins at the zone's highest pfn). This is detected in compact_zone() and in the case of direct compaction, the compact_blockskip_flush flag is set so that kswapd later resets the cached scanner pfn's, and a new compaction may again start at the zone's borders. The meeting of the scanners can happen during either scanner's activity. However, it may currently fail to be detected when it occurs in the free page scanner, due to two problems. First, isolate_freepages() keeps free_pfn at the highest block where it isolated pages from, for the purposes of not missing the pages that are returned back to allocator when migration fails. Second, failing to isolate enough free pages due to scanners meeting results in -ENOMEM being returned by migrate_pages(), which makes compact_zone() bail out immediately without calling compact_finished() that would detect scanners meeting. This failure to detect scanners meeting might result in repeated attempts at compaction of a zone that keep starting from the cached pfn's close to the meeting point, and quickly failing through the -ENOMEM path, without the cached pfns being reset, over and over. This has been observed (through additional tracepoints) in the third phase of the mmtests stress-highalloc benchmark, where the allocator runs on an otherwise idle system. The problem was observed in the DMA32 zone, which was used as a fallback to the preferred Normal zone, but on the 4GB system it was actually the largest zone. The problem is even amplified for such fallback zone - the deferred compaction logic, which could (after being fixed by a previous patch) reset the cached scanner pfn's, is only applied to the preferred zone and not for the fallbacks. The problem in the third phase of the benchmark was further amplified by commit 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy") which resulted in a non-deterministic regression of the allocation success rate from ~85% to ~65%. This occurs in about half of benchmark runs, making bisection problematic. It is unlikely that the commit itself is buggy, but it should put more pressure on the DMA32 zone during phases 1 and 2, which may leave it more fragmented in phase 3 and expose the bugs that this patch fixes. The fix is to make scanners meeting in isolate_freepage() stay that way, and to check in compact_zone() for scanners meeting when migrate_pages() returns -ENOMEM. The result is that compact_finished() also detects scanners meeting and sets the compact_blockskip_flush flag to make kswapd reset the scanner pfn's. The results in stress-highalloc benchmark show that the "regression" by commit 81c0a2bb515f in phase 3 no longer occurs, and phase 1 and 2 allocation success rates are also significantly improved. Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f4e2c166880b..cc46db36e708 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -660,7 +660,7 @@ static void isolate_freepages(struct zone *zone, * is the end of the pageblock the migration scanner is using. */ pfn = cc->free_pfn; - low_pfn = cc->migrate_pfn + pageblock_nr_pages; + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* * Take care that if the migration scanner is at the end of the zone @@ -676,7 +676,7 @@ static void isolate_freepages(struct zone *zone, * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -738,7 +738,14 @@ static void isolate_freepages(struct zone *zone, /* split_free_page does not map the pages */ map_pages(freelist); - cc->free_pfn = high_pfn; + /* + * If we crossed the migrate scanner, we want to keep it that way + * so that compact_finished() may detect this + */ + if (pfn < low_pfn) + cc->free_pfn = max(pfn, zone->zone_start_pfn); + else + cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; } @@ -1005,7 +1012,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; - if (err == -ENOMEM) { + /* + * migrate_pages() may return -ENOMEM when scanners meet + * and we want compact_finished() to detect it + */ + if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { ret = COMPACT_PARTIAL; goto out; } -- cgit v1.2.1 From 50b5b094e683f8e51e82c6dfe97b1608cf97e6c0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:10 -0800 Subject: mm: compaction: do not mark unmovable pageblocks as skipped in async compaction Compaction temporarily marks pageblocks where it fails to isolate pages as to-be-skipped in further compactions, in order to improve efficiency. One of the reasons to fail isolating pages is that isolation is not attempted in pageblocks that are not of MIGRATE_MOVABLE (or CMA) type. The problem is that blocks skipped due to not being MIGRATE_MOVABLE in async compaction become skipped due to the temporary mark also in future sync compaction. Moreover, this may follow quite soon during __alloc_page_slowpath, without much time for kswapd to clear the pageblock skip marks. This goes against the idea that sync compaction should try to scan these blocks more thoroughly than the async compaction. The fix is to ensure in async compaction that these !MIGRATE_MOVABLE blocks are not marked to be skipped. Note this should not affect performance or locking impact of further async compactions, as skipping a block due to being !MIGRATE_MOVABLE is done soon after skipping a block marked to be skipped, both without locking. Signed-off-by: Vlastimil Babka Cc: Rik van Riel Acked-by: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index cc46db36e708..32a033cb5c65 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; + bool skipped_async_unsuitable = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { cc->finished_update_migrate = true; + skipped_async_unsuitable = true; goto next_pageblock; } @@ -627,8 +629,13 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ - if (low_pfn == end_pfn) + /* + * Update the pageblock-skip information and cached scanner pfn, + * if the whole pageblock was scanned without isolating any page. + * This is not done when pageblock was skipped due to being unsuitable + * for async compaction, so that eventual sync compaction can try. + */ + if (low_pfn == end_pfn && !skipped_async_unsuitable) update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); -- cgit v1.2.1 From 55b7c4c99f6a448f72179297fe6432544f220063 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Jan 2014 15:51:11 -0800 Subject: mm: compaction: reset scanner positions immediately when they meet Compaction used to start its migrate and free page scaners at the zone's lowest and highest pfn, respectively. Later, caching was introduced to remember the scanners' progress across compaction attempts so that pageblocks are not re-scanned uselessly. Additionally, pageblocks where isolation failed are marked to be quickly skipped when encountered again in future compactions. Currently, both the reset of cached pfn's and clearing of the pageblock skip information for a zone is done in __reset_isolation_suitable(). This function gets called when: - compaction is restarting after being deferred - compact_blockskip_flush flag is set in compact_finished() when the scanners meet (and not again cleared when direct compaction succeeds in allocation) and kswapd acts upon this flag before going to sleep This behavior is suboptimal for several reasons: - when direct sync compaction is called after async compaction fails (in the allocation slowpath), it will effectively do nothing, unless kswapd happens to process the compact_blockskip_flush flag meanwhile. This is racy and goes against the purpose of sync compaction to more thoroughly retry the compaction of a zone where async compaction has failed. The restart-after-deferring path cannot help here as deferring happens only after the sync compaction fails. It is also done only for the preferred zone, while the compaction might be done for a fallback zone. - the mechanism of marking pageblock to be skipped has little value since the cached pfn's are reset only together with the pageblock skip flags. This effectively limits pageblock skip usage to parallel compactions. This patch changes compact_finished() so that cached pfn's are reset immediately when the scanners meet. Clearing pageblock skip flags is unchanged, as well as the other situations where cached pfn's are reset. This allows the sync-after-async compaction to retry pageblocks not marked as skipped, such as blocks !MIGRATE_MOVABLE blocks that async compactions now skips without marking them. Signed-off-by: Vlastimil Babka Cc: Rik van Riel Acked-by: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 32a033cb5c65..3a91a2ea3d34 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -851,6 +851,10 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { + /* Let the next compaction start anew. */ + zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); + /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kswapd does not set the -- cgit v1.2.1 From aed0a0e32de387da831284fda25021de32477195 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 21 Jan 2014 15:51:12 -0800 Subject: mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure __GFP_NOFAIL may return NULL when coupled with GFP_NOWAIT or GFP_ATOMIC. Luckily, nothing currently does such craziness. So instead of causing such allocations to loop (potentially forever), we maintain the current behavior and also warn about the new users of the deprecated flag. Suggested-by: Andrew Morton Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84da0e3bc886..533e2147d14f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2525,8 +2525,15 @@ rebalance: } /* Atomic allocations - we can't balance anything */ - if (!wait) + if (!wait) { + /* + * All existing users of the deprecated __GFP_NOFAIL are + * blockable, so warn of any new users that actually allow this + * type of allocation to fail. + */ + WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; + } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) -- cgit v1.2.1 From 354a3363363724c21ea2e4b28370e27983c2452e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 21 Jan 2014 15:51:14 -0800 Subject: mm/migrate: add comment about permanent failure path Let's add a comment about where the failed page goes to, which makes code more readable. Signed-off-by: Naoya Horiguchi Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Wanpeng Li Acked-by: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index f9e16350d09c..626ca3c5d07b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1125,7 +1125,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, nr_succeeded++; break; default: - /* Permanent failure */ + /* + * Permanent failure (-EBUSY, -ENOSYS, etc.): + * unlike -EAGAIN case, the failed page is + * removed from migration page list and not + * retried in the next outer loop. + */ nr_failed++; break; } -- cgit v1.2.1 From 32665f2bbfed2e325d37236d9b0071a11a69124e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:15 -0800 Subject: mm/migrate: correct failure handling if !hugepage_migration_support() We should remove the page from the list if we fail with ENOSYS, since migrate_pages() consider error cases except -ENOMEM and -EAGAIN as permanent failure and it assumes that the page would be removed from the list. Without this patch, we could overcount number of failure. In addition, we should put back the new hugepage if !hugepage_migration_support(). If not, we would leak hugepage memory. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 626ca3c5d07b..13bedcc4656b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1013,7 +1013,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, { int rc = 0; int *result = NULL; - struct page *new_hpage = get_new_page(hpage, private, &result); + struct page *new_hpage; struct anon_vma *anon_vma = NULL; /* @@ -1023,9 +1023,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_support(page_hstate(hpage))) + if (!hugepage_migration_support(page_hstate(hpage))) { + putback_active_hugepage(hpage); return -ENOSYS; + } + new_hpage = get_new_page(hpage, private, &result); if (!new_hpage) return -ENOMEM; -- cgit v1.2.1 From 59c82b70dcd9cc273c21fae5abc29e41fc732a17 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:17 -0800 Subject: mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages Some part of putback_lru_pages() and putback_movable_pages() is duplicated, so it could confuse us what we should use. We can remove putback_lru_pages() since it is not really needed now. This makes us undestand and maintain the code more easily. And comment on putback_movable_pages() is stale now, so fix it. Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 +++++++- mm/migrate.c | 29 +++++++++-------------------- 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9fa6586d5275..b25ed321e667 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - putback_lru_pages(&pagelist); + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 13bedcc4656b..8a73d66be102 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -71,29 +71,13 @@ int migrate_prep_local(void) return 0; } -/* - * Add isolated pages on the list back to the LRU under page lock - * to avoid leaking evictable pages back onto unevictable list. - */ -void putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - - list_for_each_entry_safe(page, page2, l, lru) { - list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - putback_lru_page(page); - } -} - /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * - * This function shall be used instead of putback_lru_pages(), - * whenever the isolated pageset has been built by isolate_migratepages_range() + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_huge_page(). */ void putback_movable_pages(struct list_head *l) { @@ -1725,7 +1709,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); if (nr_remaining) { - putback_lru_pages(&migratepages); + if (!list_empty(&migratepages)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } isolated = 0; } else count_vm_numa_event(NUMA_PAGE_MIGRATE); -- cgit v1.2.1 From 78d5506e82b21a1a1de68c24182db2c2fe521422 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 21 Jan 2014 15:51:18 -0800 Subject: mm/migrate: remove unused function, fail_migrate_page() fail_migrate_page() isn't used anywhere, so remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Reviewed-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Rafael Aquini Cc: Vlastimil Babka Cc: Wanpeng Li Cc: Mel Gorman Cc: Rik van Riel Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 8a73d66be102..a8025befc323 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -552,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Migration functions ***********************************************************/ -/* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate/PagePrivate2. -- cgit v1.2.1