From 71baba4b92dc1fa1bc461742c6ab1942ec6034e9 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Fri, 6 Nov 2015 16:28:28 -0800
Subject: mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM

Clearing __GFP_WAIT was historically how a caller signalled that it was in
atomic context and could not sleep.  Now it is possible to distinguish
between true atomic context and callers that are merely not willing to
sleep.  The latter should clear __GFP_DIRECT_RECLAIM so that kswapd will
still be woken.  As clearing __GFP_WAIT now behaves differently from
clearing only __GFP_DIRECT_RECLAIM, there is a risk that people will clear
the wrong flags.  This patch renames __GFP_WAIT to __GFP_RECLAIM to
clearly indicate what it does -- setting it allows all reclaim activity,
clearing it prevents it.

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Mel Gorman
Acked-by: Michal Hocko
Acked-by: Vlastimil Babka
Acked-by: Johannes Weiner
Cc: Christoph Lameter
Acked-by: David Rientjes
Cc: Vitaly Wool
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f5c08b46fef8..9812d4618651 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -786,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 {
-	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
 }
 
 /* Caller must hold page table lock. */
--
cgit v1.2.1
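For readers following the series: after the earlier flag split,
__GFP_RECLAIM is simply the union of the two reclaim flags, so clearing it
disables both direct reclaim and the kswapd wakeup, while clearing only
__GFP_DIRECT_RECLAIM still lets kswapd be woken.  A minimal standalone
sketch of that flag arithmetic follows; the bit positions are illustrative
placeholders, not the kernel's actual gfp.h values.

	/* Sketch of the __GFP_RECLAIM relationship described above.
	 * Bit positions are placeholders, not the kernel's real values. */
	#include <stdio.h>

	#define GFP_KSWAPD_RECLAIM 0x1u /* wake kswapd (background reclaim) */
	#define GFP_DIRECT_RECLAIM 0x2u /* caller may enter reclaim and sleep */
	#define GFP_RECLAIM (GFP_DIRECT_RECLAIM | GFP_KSWAPD_RECLAIM)

	int main(void)
	{
		unsigned int flags = GFP_RECLAIM; /* a sleeping allocation */

		/* Not willing to sleep: drop direct reclaim, keep kswapd. */
		unsigned int nosleep = flags & ~GFP_DIRECT_RECLAIM;

		/* True atomic context: clearing the whole mask stops all reclaim. */
		unsigned int atomic = flags & ~GFP_RECLAIM;

		printf("nosleep still wakes kswapd: %s\n",
		       (nosleep & GFP_KSWAPD_RECLAIM) ? "yes" : "no"); /* yes */
		printf("atomic wakes kswapd:        %s\n",
		       (atomic & GFP_KSWAPD_RECLAIM) ? "yes" : "no");  /* no */
		return 0;
	}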
From 974a786e63c96a2401a78ddba926f34c128474f1 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Fri, 6 Nov 2015 16:28:34 -0800
Subject: mm, page_alloc: remove MIGRATE_RESERVE

MIGRATE_RESERVE preserves an old property of the buddy allocator that
existed prior to fragmentation avoidance -- min_free_kbytes worth of pages
tended to remain contiguous until the only alternative was to fail the
allocation.  At the time it was discovered that high-order atomic
allocations relied on this property, so MIGRATE_RESERVE was introduced.  A
later patch will introduce an alternative, MIGRATE_HIGHATOMIC; deleting
MIGRATE_RESERVE and its supporting code in a patch of its own makes the
series easier to review.

Note that this patch in isolation may look like a regression to anyone
bisecting high-order atomic allocation failures, since the replacement
only arrives later in the series.

Signed-off-by: Mel Gorman
Acked-by: Vlastimil Babka
Cc: Christoph Lameter
Cc: David Rientjes
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Vitaly Wool
Cc: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9812d4618651..dabd247df535 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void)
 	for_each_populated_zone(zone)
 		nr_zones++;
 
-	/* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
 	recommended_min = pageblock_nr_pages * nr_zones * 2;
 
 	/*
--
cgit v1.2.1
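As a worked example of the arithmetic behind the new comment, the sketch
below computes recommended_min for assumed figures -- a typical x86-64 box
with 4 KiB pages, 2 MiB pageblocks (so pageblock_nr_pages = 512) and three
populated zones.  The numbers are illustrative, not read from a real
system.

	/* Worked example of set_recommended_min_free_kbytes()'s core sum.
	 * Values are assumed x86-64 defaults, not taken from a live kernel. */
	#include <stdio.h>

	int main(void)
	{
		const unsigned long pageblock_nr_pages = 512; /* 2 MiB / 4 KiB */
		const unsigned long page_size_kib = 4;
		const unsigned long nr_zones = 3; /* e.g. DMA, DMA32, Normal */

		/* Two free pageblocks per zone, as in the patched comment. */
		unsigned long recommended_min =
			pageblock_nr_pages * nr_zones * 2;

		printf("recommended_min = %lu pages (%lu KiB)\n",
		       recommended_min, recommended_min * page_size_kib);
		/* -> 3072 pages, i.e. 12288 KiB on this example machine */
		return 0;
	}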
From d6669d689f397137381fe6729293e0eba1ef09a8 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin
Date: Fri, 6 Nov 2015 16:28:52 -0800
Subject: thp: remove unused vma parameter from khugepaged_alloc_page

The "vma" parameter to khugepaged_alloc_page() is unused.  It has to
remain unused, or the mmap_sem read-lock-dropping optimisation introduced
by commit 8b1645685acf ("mm, THP: don't hold mmap_sem in khugepaged when
allocating THP") would not be safe: once the read lock is dropped, the vma
may no longer be valid.  So let's remove it.

Signed-off-by: Aaron Tomlin
Acked-by: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dabd247df535..73266ee7274c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2413,8 +2413,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 
 static struct page *
 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
-		       struct vm_area_struct *vma, unsigned long address,
-		       int node)
+		       unsigned long address, int node)
 {
 	VM_BUG_ON_PAGE(*hpage, *hpage);
 
@@ -2481,8 +2480,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 
 static struct page *
 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
-		       struct vm_area_struct *vma, unsigned long address,
-		       int node)
+		       unsigned long address, int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
@@ -2530,7 +2528,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 					   __GFP_THISNODE;
 
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
 	if (!new_page)
 		return;
 
--
cgit v1.2.1

From 1d798ca3f16437c71ff63e36597ff07f9c12e4d6 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Fri, 6 Nov 2015 16:29:54 -0800
Subject: mm: make compound_head() robust

Hugh has pointed out that a compound_head() call can be unsafe in some
contexts.  Here is one example:

	CPU0					CPU1

isolate_migratepages_block()
  page_count()
    compound_head()
      !!PageTail() == true
					put_page()
					  tail->first_page = NULL
      head = tail->first_page
					alloc_pages(__GFP_COMP)
					  prep_compound_page()
					    tail->first_page = head
					    __SetPageTail(p);
      !!PageTail() == true

The race is purely theoretical.  I don't think it's possible to trigger it
in practice.  But who knows.

We can fix the race by changing how PageTail() and compound_head() are
encoded within struct page, so that both can be updated in one shot.

The patch introduces page->compound_head in the third double-word block,
in front of compound_dtor and compound_order.  Bit 0 encodes PageTail(),
and if it is set the remaining bits are a pointer to the head page.

The patch moves page->pmd_huge_pte out of the word, just in case an
architecture defines pgtable_t as something that can have bit 0 set.

hugetlb_cgroup uses page->lru.next in the second tail page to store a
pointer to struct hugetlb_cgroup.  The patch switches it to use
page->private in the second tail page instead.  The space is free since
->first_page is removed from the union.

The patch also opens the possibility of removing the
HUGETLB_CGROUP_MIN_ORDER limitation, since there is now space in the first
tail page to store the struct hugetlb_cgroup pointer.  But that is out of
scope for this patch.

This means page->compound_head shares storage space with:

 - page->lru.next;
 - page->next;
 - page->rcu_head.next.

That is too long a list to be absolutely sure, but it looks like nobody
uses bit 0 of the word.

page->rcu_head.next is guaranteed[1] to have bit 0 clear as long as we use
call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu().  But a future
call_rcu_lazy() is not allowed, as it would make use of the bit and we
could get a false-positive PageTail().

[1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com

Signed-off-by: Kirill A. Shutemov
Acked-by: Michal Hocko
Reviewed-by: Andrea Arcangeli
Cc: Hugh Dickins
Cc: David Rientjes
Cc: Vlastimil Babka
Acked-by: Paul E. McKenney
Cc: Aneesh Kumar K.V
Cc: Andi Kleen
Cc: Christoph Lameter
Cc: Joonsoo Kim
Cc: Sergey Senozhatsky
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 73266ee7274c..e1ccc83f73d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1755,8 +1755,7 @@ static void __split_huge_page_refcount(struct page *page,
 					 (1L << PG_unevictable)));
 	page_tail->flags |= (1L << PG_dirty);
 
-	/* clear PageTail before overwriting first_page */
-	smp_wmb();
+	clear_compound_head(page_tail);
 
 	if (page_is_young(page))
 		set_page_young(page_tail);
--
cgit v1.2.1
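To make the encoding concrete, here is a minimal standalone model of the
scheme the message describes -- bit 0 marks a tail page and the remaining
bits carry the head pointer, so one word update flips both at once.  It is
a sketch, not the kernel's actual definitions, which additionally use
READ_ONCE/WRITE_ONCE so the flag and pointer are observed in a single
load.

	/* Standalone model of the compound_head encoding described above;
	 * not the kernel's exact code. */
	#include <assert.h>

	struct page {
		unsigned long compound_head; /* bit 0 set => tail page;
						rest = head page pointer */
	};

	static void set_compound_head(struct page *tail, struct page *head)
	{
		/* struct page is word-aligned, so bit 0 is free to borrow. */
		tail->compound_head = (unsigned long)head + 1;
	}

	static void clear_compound_head(struct page *tail)
	{
		tail->compound_head = 0;
	}

	static int PageTail(const struct page *page)
	{
		return page->compound_head & 1;
	}

	static struct page *compound_head(struct page *page)
	{
		unsigned long head = page->compound_head;

		if (head & 1)
			return (struct page *)(head - 1);
		return page; /* not a tail page: it is its own head */
	}

	int main(void)
	{
		struct page head = { 0 }, tail = { 0 };

		set_compound_head(&tail, &head);
		assert(PageTail(&tail) && compound_head(&tail) == &head);

		clear_compound_head(&tail); /* as in the split path above */
		assert(!PageTail(&tail) && compound_head(&tail) == &tail);
		return 0;
	}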