From 8fb5debc5fcd450470cdd789c2d80ef95ebb8cf4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:42:52 -0800 Subject: userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd support hugetlb_mcopy_atomic_pte is the low level routine that implements the userfaultfd UFFDIO_COPY command. It is based on the existing mcopy_atomic_pte routine with modifications for huge pages. Link: http://lkml.kernel.org/r/20161216144821.5183-18-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c7025c132670..dec628b26f59 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3948,6 +3948,87 @@ out_mutex: return ret; } +/* + * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with + * modifications for huge pages. + */ +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct hstate *h = hstate_vma(dst_vma); + pte_t _dst_pte; + spinlock_t *ptl; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) + goto out; + + ret = copy_huge_page_from_user(page, + (const void __user *) src_addr, + pages_per_huge_page(h)); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + set_page_huge_active(page); + + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); + spin_lock(ptl); + + ret = -EEXIST; + if (!huge_pte_none(huge_ptep_get(dst_pte))) + goto out_release_unlock; + + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + + _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = huge_pte_mkdirty(_dst_pte); + _dst_pte = pte_mkyoung(_dst_pte); + + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, + dst_vma->vm_flags & VM_WRITE); + hugetlb_count_add(pages_per_huge_page(h), dst_mm); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + ret = 0; +out: + return ret; +out_release_unlock: + spin_unlock(ptl); + put_page(page); + goto out; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, -- cgit v1.2.1 From 810a56b943e265bbabfcd5a8e54cb8d3b16cd6e4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:42:58 -0800 Subject: userfaultfd: hugetlbfs: fix __mcopy_atomic_hugetlb retry/error processing The new routine copy_huge_page_from_user() uses kmap_atomic() to map PAGE_SIZE pages. However, this prevents page faults in the subsequent call to copy_from_user(). This is OK in the case where the routine is copied with mmap_sema held. However, in another case we want to allow page faults. So, add a new argument allow_pagefault to indicate if the routine should allow page faults. [dan.carpenter@oracle.com: unmap the correct pointer] Link: http://lkml.kernel.org/r/20170113082608.GA3548@mwanda [akpm@linux-foundation.org: kunmap() takes a page*, per Hugh] Link: http://lkml.kernel.org/r/20161216144821.5183-20-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Signed-off-by: Dan Carpenter Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Hugh Dickins Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dec628b26f59..5d20af921a30 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3973,7 +3973,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = copy_huge_page_from_user(page, (const void __user *) src_addr, - pages_per_huge_page(h)); + pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { -- cgit v1.2.1 From 1a1aad8a9b7bd34f60cdf98cd7915f00ae892c45 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:01 -0800 Subject: userfaultfd: hugetlbfs: add userfaultfd hugetlb hook When processing a hugetlb fault for no page present, check the vma to determine if faults are to be handled via userfaultfd. If so, drop the hugetlb_fault_mutex and call handle_userfault(). Link: http://lkml.kernel.org/r/20161216144821.5183-21-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Acked-by: Hillf Danton Cc: "Dr. David Alan Gilbert" Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d20af921a30..a4b29054cc3f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int hugepages_treat_as_movable; @@ -3680,6 +3681,38 @@ retry: size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; + + /* + * Check for page in userfault range + */ + if (userfaultfd_missing(vma)) { + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = address, + .flags = flags, + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, + idx, address); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + ret = handle_userfault(&vmf, VM_UFFD_MISSING); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); -- cgit v1.2.1 From 87ffc118b54dcd4cc642723603d944673248152f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:13 -0800 Subject: userfaultfd: hugetlbfs: gup: support VM_FAULT_RETRY Add support for VM_FAULT_RETRY to follow_hugetlb_page() so that get_user_pages_unlocked/locked and "nonblocking/FOLL_NOWAIT" features will work on hugetlbfs. This is required for fully functional userfaultfd non-present support on hugetlbfs. Link: http://lkml.kernel.org/r/20161216144821.5183-25-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Mike Kravetz Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a4b29054cc3f..f6c7ff316daf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4065,7 +4065,7 @@ out_release_unlock: long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags) + long i, unsigned int flags, int *nonblocking) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -4128,16 +4128,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { int ret; + unsigned int fault_flags = 0; if (pte) spin_unlock(ptl); - ret = hugetlb_fault(mm, vma, vaddr, - (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - if (!(ret & VM_FAULT_ERROR)) - continue; - - remainder = 0; - break; + if (flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT; + if (flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & + FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } + ret = hugetlb_fault(mm, vma, vaddr, fault_flags); + if (ret & VM_FAULT_ERROR) { + remainder = 0; + break; + } + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + *nr_pages = 0; + /* + * VM_FAULT_RETRY must not return an + * error, it will return zero + * instead. + * + * No need to update "position" as the + * caller will not check it after + * *nr_pages is set to 0. + */ + return i; + } + continue; } pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; @@ -4166,6 +4193,11 @@ same_page: spin_unlock(ptl); } *nr_pages = remainder; + /* + * setting position is actually required only if remainder is + * not zero but it's faster not to add a "if (remainder)" + * branch. + */ *position = vaddr; return i ? i : -EFAULT; -- cgit v1.2.1 From 1c9e8def43a3452e7af658b340f5f4f4ecde5c38 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:43 -0800 Subject: userfaultfd: hugetlbfs: add UFFDIO_COPY support for shared mappings When userfaultfd hugetlbfs support was originally added, it followed the pattern of anon mappings and did not support any vmas marked VM_SHARED. As such, support was only added for private mappings. Remove this limitation and support shared mappings. The primary functional change required is adding pages to the page cache. More subtle changes are required for huge page reservation handling in error paths. A lengthy comment in the code describes the reservation handling. [mike.kravetz@oracle.com: update] Link: http://lkml.kernel.org/r/c9c8cafe-baa7-05b4-34ea-1dfa5523a85f@oracle.com Link: http://lkml.kernel.org/r/1487195210-12839-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Andrea Arcangeli Cc: Andrew Morton Cc: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f6c7ff316daf..30e7709a5121 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3992,6 +3992,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, unsigned long src_addr, struct page **pagep) { + int vm_shared = dst_vma->vm_flags & VM_SHARED; struct hstate *h = hstate_vma(dst_vma); pte_t _dst_pte; spinlock_t *ptl; @@ -4028,6 +4029,18 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); set_page_huge_active(page); + /* + * If shared, add to page cache + */ + if (vm_shared) { + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + ret = huge_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; + } + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); spin_lock(ptl); @@ -4035,8 +4048,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none(huge_ptep_get(dst_pte))) goto out_release_unlock; - ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + if (vm_shared) { + page_dup_rmap(page, true); + } else { + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + } _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); if (dst_vma->vm_flags & VM_WRITE) @@ -4053,11 +4070,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + if (vm_shared) + unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); +out_release_nounlock: + if (vm_shared) + unlock_page(page); put_page(page); goto out; } -- cgit v1.2.1 From 11bac80004499ea59f361ef2a5516c84b6eab675 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:41 -0800 Subject: mm, fs: reduce fault, page_mkwrite, and pfn_mkwrite to take only vmf ->fault(), ->page_mkwrite(), and ->pfn_mkwrite() calls do not need to take a vma and vmf parameter when the vma already resides in vmf. Remove the vma parameter to simplify things. [arnd@arndb.de: fix ARM build] Link: http://lkml.kernel.org/r/20170125223558.1451224-1-arnd@arndb.de Link: http://lkml.kernel.org/r/148521301778.19116.10840599906674778980.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Signed-off-by: Arnd Bergmann Reviewed-by: Ross Zwisler Cc: Theodore Ts'o Cc: Darrick J. Wong Cc: Matthew Wilcox Cc: Dave Hansen Cc: Christoph Hellwig Cc: Jan Kara Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 30e7709a5121..167fd0722c15 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3142,7 +3142,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; -- cgit v1.2.1 From ca96b625341027f611c3e61351a70311077ebcf5 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:37 -0800 Subject: mm: alloc_contig_range: allow to specify GFP mask Currently alloc_contig_range assumes that the compaction should be done with the default GFP_KERNEL flags. This is probably right for all current uses of this interface, but may change as CMA is used in more use-cases (including being the default DMA memory allocator on some platforms). Change the function prototype, to allow for passing through the GFP mask set by upper layers. Also respect global restrictions by applying memalloc_noio_flags to the passed in flags. Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 167fd0722c15..2e0e8159ce8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + GFP_KERNEL); } static bool pfn_range_valid_gigantic(struct zone *z, -- cgit v1.2.1 From 174cd4b1e5fbd0d74c68cf3a74f5bd4923485512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 19:15:33 +0100 Subject: sched/headers: Prepare to move signal wakeup & sigpending methods from into Fix up affected files that include this signal functionality via sched.h. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2e0e8159ce8e..a7aa811b7d14 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1