diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 5 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/hmm.c | 523 | ||||
-rw-r--r-- | mm/ksm.c | 14 | ||||
-rw-r--r-- | mm/maccess.c | 70 | ||||
-rw-r--r-- | mm/mapping_dirty_helpers.c | 315 | ||||
-rw-r--r-- | mm/memcontrol.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 104 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 16 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 557 | ||||
-rw-r--r-- | mm/nommu.c | 15 | ||||
-rw-r--r-- | mm/pagewalk.c | 99 | ||||
-rw-r--r-- | mm/vmalloc.c | 20 |
13 files changed, 1214 insertions, 527 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a5dae9a7eb51..f332efe751dd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -284,6 +284,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool select SRCU + select INTERVAL_TREE config KSM bool "Enable KSM for page merging" @@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS config HMM_MIRROR bool depends on MMU - depends on MMU_NOTIFIER config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" @@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL config ARCH_HAS_HUGEPD bool +config MAPPING_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index d996846697ef..1937cc251883 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -107,3 +107,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o @@ -26,193 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) -{ - struct hmm *hmm; - - hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); - if (!hmm) - return ERR_PTR(-ENOMEM); - - init_waitqueue_head(&hmm->wq); - INIT_LIST_HEAD(&hmm->mirrors); - init_rwsem(&hmm->mirrors_sem); - INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->ranges_lock); - hmm->notifiers = 0; - return &hmm->mmu_notifier; -} - -static void hmm_free_notifier(struct mmu_notifier *mn) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - WARN_ON(!list_empty(&hmm->ranges)); - WARN_ON(!list_empty(&hmm->mirrors)); - kfree(hmm); -} - -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - - /* - * Since hmm_range_register() holds the mmget() lock hmm_release() is - * prevented as long as a range exists. - */ - WARN_ON(!list_empty_careful(&hmm->ranges)); - - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - /* - * Note: The driver is not allowed to trigger - * hmm_mirror_unregister() from this thread. - */ - if (mirror->ops->release) - mirror->ops->release(mirror); - } - up_read(&hmm->mirrors_sem); -} - -static void notifiers_decrement(struct hmm *hmm) -{ - unsigned long flags; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); -} - -static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - struct hmm_range *range; - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers++; - list_for_each_entry(range, &hmm->ranges, list) { - if (nrange->end < range->start || nrange->start >= range->end) - continue; - - range->valid = false; - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - if (mmu_notifier_range_blockable(nrange)) - down_read(&hmm->mirrors_sem); - else if (!down_read_trylock(&hmm->mirrors_sem)) { - ret = -EAGAIN; - goto out; - } - - list_for_each_entry(mirror, &hmm->mirrors, list) { - int rc; - - rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); - if (rc) { - if (WARN_ON(mmu_notifier_range_blockable(nrange) || - rc != -EAGAIN)) - continue; - ret = -EAGAIN; - break; - } - } - up_read(&hmm->mirrors_sem); - -out: - if (ret) - notifiers_decrement(hmm); - return ret; -} - -static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - notifiers_decrement(hmm); -} - -static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { - .release = hmm_release, - .invalidate_range_start = hmm_invalidate_range_start, - .invalidate_range_end = hmm_invalidate_range_end, - .alloc_notifier = hmm_alloc_notifier, - .free_notifier = hmm_free_notifier, -}; - -/* - * hmm_mirror_register() - register a mirror against an mm - * - * @mirror: new mirror struct to register - * @mm: mm to register against - * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments - * - * To start mirroring a process address space, the device driver must register - * an HMM mirror struct. - * - * The caller cannot unregister the hmm_mirror while any ranges are - * registered. - * - * Callers using this function must put a call to mmu_notifier_synchronize() - * in their module exit functions. - */ -int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) -{ - struct mmu_notifier *mn; - - lockdep_assert_held_write(&mm->mmap_sem); - - /* Sanity check */ - if (!mm || !mirror || !mirror->ops) - return -EINVAL; - - mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); - if (IS_ERR(mn)) - return PTR_ERR(mn); - mirror->hmm = container_of(mn, struct hmm, mmu_notifier); - - down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - - return 0; -} -EXPORT_SYMBOL(hmm_mirror_register); - -/* - * hmm_mirror_unregister() - unregister a mirror - * - * @mirror: mirror struct to unregister - * - * Stop mirroring a process address space, and cleanup. - */ -void hmm_mirror_unregister(struct hmm_mirror *mirror) -{ - struct hmm *hmm = mirror->hmm; - - down_write(&hmm->mirrors_sem); - list_del(&mirror->list); - up_write(&hmm->mirrors_sem); - mmu_notifier_put(&hmm->mmu_notifier); -} -EXPORT_SYMBOL(hmm_mirror_unregister); - struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; @@ -252,18 +65,15 @@ err: return -EFAULT; } -static int hmm_pfns_bad(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_pfns_fill(unsigned long addr, unsigned long end, + struct hmm_range *range, enum hmm_pfn_value_e value) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[HMM_PFN_ERROR]; + pfns[i] = range->values[value]; return 0; } @@ -532,8 +342,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { - *pfn = range->values[HMM_PFN_SPECIAL]; - return -EFAULT; + if (!is_zero_pfn(pte_pfn(pte))) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + /* + * Since each architecture defines a struct page for the zero + * page, just fall through and treat it like a normal page. + */ } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; @@ -584,7 +400,7 @@ again: } return 0; } else if (!pmd_present(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -612,7 +428,7 @@ again: * recover. */ if (pmd_bad(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); ptep = pte_offset_map(pmdp, addr); i = (addr - range->start) >> PAGE_SHIFT; @@ -770,93 +586,55 @@ unlock: #define hmm_vma_walk_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static void hmm_pfns_clear(struct hmm_range *range, - uint64_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = range->values[HMM_PFN_NONE]; -} - -/* - * hmm_range_register() - start tracking change to CPU page table over a range - * @range: range - * @mm: the mm struct for the range of virtual address - * - * Return: 0 on success, -EFAULT if the address space is no longer valid - * - * Track updates to the CPU page table see include/linux/hmm.h - */ -int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) +static int hmm_vma_walk_test(unsigned long start, unsigned long end, + struct mm_walk *walk) { - struct hmm *hmm = mirror->hmm; - unsigned long flags; - - range->valid = false; - range->hmm = NULL; - - if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) - return -EINVAL; - if (range->start >= range->end) - return -EINVAL; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; - /* Prevent hmm_release() from running while the range is valid */ - if (!mmget_not_zero(hmm->mmu_notifier.mm)) + /* + * Skip vma ranges that don't have struct page backing them or + * map I/O devices directly. + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) return -EFAULT; - /* Initialize range to track CPU page table updates. */ - spin_lock_irqsave(&hmm->ranges_lock, flags); - - range->hmm = hmm; - list_add(&range->list, &hmm->ranges); - /* - * If there are any concurrent notifiers we have to wait for them for - * the range to be valid (see hmm_range_wait_until_valid()). + * If the vma does not allow read access, then assume that it does not + * allow write access either. HMM does not support architectures + * that allow write without read. */ - if (!hmm->notifiers) - range->valid = true; - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - return 0; -} -EXPORT_SYMBOL(hmm_range_register); + if (!(vma->vm_flags & VM_READ)) { + bool fault, write_fault; -/* - * hmm_range_unregister() - stop tracking change to CPU page table over a range - * @range: range - * - * Range struct is used to track updates to the CPU page table after a call to - * hmm_range_register(). See include/linux/hmm.h for how to use it. - */ -void hmm_range_unregister(struct hmm_range *range) -{ - struct hmm *hmm = range->hmm; - unsigned long flags; + /* + * Check to see if a fault is requested for any page in the + * range. + */ + hmm_range_need_fault(hmm_vma_walk, range->pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, + 0, &fault, &write_fault); + if (fault || write_fault) + return -EFAULT; - spin_lock_irqsave(&hmm->ranges_lock, flags); - list_del_init(&range->list); - spin_unlock_irqrestore(&hmm->ranges_lock, flags); + hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + hmm_vma_walk->last = end; - /* Drop reference taken by hmm_range_register() */ - mmput(hmm->mmu_notifier.mm); + /* Skip this vma and continue processing the next vma. */ + return 1; + } - /* - * The range is now invalid and the ref on the hmm is dropped, so - * poison the pointer. Leave other fields in place, for the caller's - * use. - */ - range->valid = false; - memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); + return 0; } -EXPORT_SYMBOL(hmm_range_unregister); static const struct mm_walk_ops hmm_walk_ops = { .pud_entry = hmm_vma_walk_pud, .pmd_entry = hmm_vma_walk_pmd, .pte_hole = hmm_vma_walk_hole, .hugetlb_entry = hmm_vma_walk_hugetlb_entry, + .test_walk = hmm_vma_walk_test, }; /** @@ -889,210 +667,27 @@ static const struct mm_walk_ops hmm_walk_ops = { */ long hmm_range_fault(struct hmm_range *range, unsigned int flags) { - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; - struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; + struct hmm_vma_walk hmm_vma_walk = { + .range = range, + .last = range->start, + .flags = flags, + }; + struct mm_struct *mm = range->notifier->mm; int ret; - lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); + lockdep_assert_held(&mm->mmap_sem); do { /* If range is no longer valid force retry. */ - if (!range->valid) + if (mmu_interval_check_retry(range->notifier, + range->notifier_seq)) return -EBUSY; + ret = walk_page_range(mm, hmm_vma_walk.last, range->end, + &hmm_walk_ops, &hmm_vma_walk); + } while (ret == -EBUSY); - vma = find_vma(hmm->mmu_notifier.mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it - * does not allow write access, either. HMM does not - * support architecture that allow write without read. - */ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } - - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; - hmm_vma_walk.flags = flags; - hmm_vma_walk.range = range; - end = min(range->end, vma->vm_end); - - walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, - &hmm_vma_walk); - - do { - ret = walk_page_range(vma->vm_mm, start, end, - &hmm_walk_ops, &hmm_vma_walk); - start = hmm_vma_walk.last; - - /* Keep trying while the range is valid. */ - } while (ret == -EBUSY && range->valid); - - if (ret) { - unsigned long i; - - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], - hmm_vma_walk.last, range->end); - return ret; - } - start = end; - - } while (start < range->end); - + if (ret) + return ret; return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); - -/** - * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. - * @range: range being faulted - * @device: device to map page to - * @daddrs: array of dma addresses for the mapped pages - * @flags: HMM_FAULT_* - * - * Return: the number of pages mapped on success (including zero), or any - * status return from hmm_range_fault() otherwise. - */ -long hmm_range_dma_map(struct hmm_range *range, struct device *device, - dma_addr_t *daddrs, unsigned int flags) -{ - unsigned long i, npages, mapped; - long ret; - - ret = hmm_range_fault(range, flags); - if (ret <= 0) - return ret ? ret : -EBUSY; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0, mapped = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - /* - * FIXME need to update DMA API to provide invalid DMA address - * value instead of a function to test dma address value. This - * would remove lot of dumb code duplicated accross many arch. - * - * For now setting it to 0 here is good enough as the pfns[] - * value is what is use to check what is valid and what isn't. - */ - daddrs[i] = 0; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - /* Check if range is being invalidated */ - if (!range->valid) { - ret = -EBUSY; - goto unmap; - } - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) - dir = DMA_BIDIRECTIONAL; - - daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); - if (dma_mapping_error(device, daddrs[i])) { - ret = -EFAULT; - goto unmap; - } - - mapped++; - } - - return mapped; - -unmap: - for (npages = i, i = 0; (i < npages) && mapped; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - if (dma_mapping_error(device, daddrs[i])) - continue; - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) - dir = DMA_BIDIRECTIONAL; - - dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); - mapped--; - } - - return ret; -} -EXPORT_SYMBOL(hmm_range_dma_map); - -/** - * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() - * @range: range being unmapped - * @device: device against which dma map was done - * @daddrs: dma address of mapped pages - * @dirty: dirty page if it had the write flag set - * Return: number of page unmapped on success, -EINVAL otherwise - * - * Note that caller MUST abide by mmu notifier or use HMM mirror and abide - * to the sync_cpu_device_pagetables() callback so that it is safe here to - * call set_page_dirty(). Caller must also take appropriate locks to avoid - * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. - */ -long hmm_range_dma_unmap(struct hmm_range *range, - struct device *device, - dma_addr_t *daddrs, - bool dirty) -{ - unsigned long i, npages; - long cpages = 0; - - /* Sanity check. */ - if (range->end <= range->start) - return -EINVAL; - if (!daddrs) - return -EINVAL; - if (!range->pfns) - return -EINVAL; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { - dir = DMA_BIDIRECTIONAL; - - /* - * See comments in function description on why it is - * safe here to call set_page_dirty() - */ - if (dirty) - set_page_dirty(page); - } - - /* Unmap and clear pfns/dma address */ - dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); - range->pfns[i] = range->values[HMM_PFN_NONE]; - /* FIXME see comments in hmm_vma_dma_map() */ - daddrs[i] = 0; - cpages++; - } - - return cpages; -} -EXPORT_SYMBOL(hmm_range_dma_unmap); @@ -885,13 +885,13 @@ static int remove_stable_node(struct stable_node *stable_node) return 0; } - if (WARN_ON_ONCE(page_mapped(page))) { - /* - * This should not happen: but if it does, just refuse to let - * merge_across_nodes be switched - there is no need to panic. - */ - err = -EBUSY; - } else { + /* + * Page could be still mapped if this races with __mmput() running in + * between ksm_exit() and exit_mmap(). Just refuse to let + * merge_across_nodes/max_page_sharing be switched. + */ + err = -EBUSY; + if (!page_mapped(page)) { /* * The stable node did not yet appear stale to get_ksm_page(), * since that allows for an unmapped ksm page to be recognized diff --git a/mm/maccess.c b/mm/maccess.c index d065736f6b87..3ca8d97e5010 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -18,6 +18,18 @@ probe_read_common(void *dst, const void __user *src, size_t size) return ret ? -EFAULT : 0; } +static __always_inline long +probe_write_common(void __user *dst, const void *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_to_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data @@ -31,11 +43,20 @@ probe_read_common(void *dst, const void __user *src, size_t size) * do_page_fault() doesn't attempt to take mmap_sem. This makes * probe_kernel_read() suitable for use within regions where the caller * already holds mmap_sem, or other locks which nest inside mmap_sem. + * + * probe_kernel_read_strict() is the same as probe_kernel_read() except for + * the case where architectures have non-overlapping user and kernel address + * ranges: probe_kernel_read_strict() will additionally return -EFAULT for + * probing memory on a user address range where probe_user_read() is supposed + * to be used instead. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_read"))); +long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + long __probe_kernel_read(void *dst, const void *src, size_t size) { long ret; @@ -85,6 +106,7 @@ EXPORT_SYMBOL_GPL(probe_user_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ + long __weak probe_kernel_write(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_write"))); @@ -94,15 +116,39 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - pagefault_enable(); + ret = probe_write_common((__force void __user *)dst, src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_write); +/** + * probe_user_write(): safely attempt to write to a user-space location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ + +long __weak probe_user_write(void __user *dst, const void *src, size_t size) + __attribute__((alias("__probe_user_write"))); + +long __probe_user_write(void __user *dst, const void *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(dst, size)) + ret = probe_write_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_write); /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. @@ -120,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_kernel_write); * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. + * + * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except + * for the case where architectures have non-overlapping user and kernel address + * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for + * probing memory on a user address range where strncpy_from_unsafe_user() is + * supposed to be used instead. */ -long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) + +long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) + __attribute__((alias("__strncpy_from_unsafe"))); + +long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, + long count) + __attribute__((alias("__strncpy_from_unsafe"))); + +long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); const void *src = unsafe_addr; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c new file mode 100644 index 000000000000..71070dda9643 --- /dev/null +++ b/mm/mapping_dirty_helpers.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagewalk.h> +#include <linux/hugetlb.h> +#include <linux/bitops.h> +#include <linux/mmu_notifier.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +/** + * struct wp_walk - Private struct for pagetable walk callbacks + * @range: Range for mmu notifiers + * @tlbflush_start: Address of first modified pte + * @tlbflush_end: Address of last modified pte + 1 + * @total: Total number of modified ptes + */ +struct wp_walk { + struct mmu_notifier_range range; + unsigned long tlbflush_start; + unsigned long tlbflush_end; + unsigned long total; +}; + +/** + * wp_pte - Write-protect a pte + * @pte: Pointer to the pte + * @addr: The virtual page address + * @walk: pagetable walk callback argument + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + */ +static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); + wpwalk->total++; + wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); + wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, + addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct clean_walk - Private struct for the clean_record_pte function. + * @base: struct wp_walk we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. + * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct clean_walk { + struct wp_walk base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base) + +/** + * clean_record_pte - Clean a pte and record its address space offset in a + * bitmap + * @pte: Pointer to the pte + * @addr: The virtual page address + * @walk: pagetable walk callback argument + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + */ +static int clean_record_pte(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + struct clean_walk *cwalk = to_clean_walk(wpwalk); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + + walk->vma->vm_pgoff - cwalk->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); + + wpwalk->total++; + wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); + wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, + addr + PAGE_SIZE); + + __set_bit(pgoff, cwalk->bitmap); + cwalk->start = min(cwalk->start, pgoff); + cwalk->end = max(cwalk->end, pgoff + 1); + } + + return 0; +} + +/* wp_clean_pmd_entry - The pagewalk pmd callback. */ +static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + /* Dirty-tracking should be handled on the pte level */ + pmd_t pmdval = pmd_read_atomic(pmd); + + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); + + return 0; +} + +/* wp_clean_pud_entry - The pagewalk pud callback. */ +static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + /* Dirty-tracking should be handled on the pte level */ + pud_t pudval = READ_ONCE(*pud); + + if (pud_trans_huge(pudval) || pud_devmap(pudval)) + WARN_ON(pud_write(pudval) || pud_dirty(pudval)); + + return 0; +} + +/* + * wp_clean_pre_vma - The pagewalk pre_vma callback. + * + * The pre_vma callback performs the cache flush, stages the tlb flush + * and calls the necessary mmu notifiers. + */ +static int wp_clean_pre_vma(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + + wpwalk->tlbflush_start = end; + wpwalk->tlbflush_end = start; + + mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, + walk->vma, walk->mm, start, end); + mmu_notifier_invalidate_range_start(&wpwalk->range); + flush_cache_range(walk->vma, start, end); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected, whereas + * tlb_gather_mmu() records the full range. + */ + inc_tlb_flush_pending(walk->mm); + + return 0; +} + +/* + * wp_clean_post_vma - The pagewalk post_vma callback. + * + * The post_vma callback performs the tlb flush and calls necessary mmu + * notifiers. + */ +static void wp_clean_post_vma(struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + + if (mm_tlb_flush_nested(walk->mm)) + flush_tlb_range(walk->vma, wpwalk->range.start, + wpwalk->range.end); + else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start) + flush_tlb_range(walk->vma, wpwalk->tlbflush_start, + wpwalk->tlbflush_end); + + mmu_notifier_invalidate_range_end(&wpwalk->range); + dec_tlb_flush_pending(walk->mm); +} + +/* + * wp_clean_test_walk - The pagewalk test_walk callback. + * + * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas. + */ +static int wp_clean_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != + (VM_SHARED | VM_MAYWRITE)) + return 1; + + return 0; +} + +static const struct mm_walk_ops clean_walk_ops = { + .pte_entry = clean_record_pte, + .pmd_entry = wp_clean_pmd_entry, + .pud_entry = wp_clean_pud_entry, + .test_walk = wp_clean_test_walk, + .pre_vma = wp_clean_pre_vma, + .post_vma = wp_clean_post_vma +}; + +static const struct mm_walk_ops wp_walk_ops = { + .pte_entry = wp_pte, + .pmd_entry = wp_clean_pmd_entry, + .pud_entry = wp_clean_pud_entry, + .test_walk = wp_clean_test_walk, + .pre_vma = wp_clean_pre_vma, + .post_vma = wp_clean_post_vma +}; + +/** + * wp_shared_mapping_range - Write-protect all ptes in an address space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * Note: This function currently skips transhuge page-table entries, since + * it's intended for dirty-tracking on the PTE level. It will warn on + * encountering transhuge write-enabled entries, though, and can easily be + * extended to handle them as well. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. + */ +unsigned long wp_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct wp_walk wpwalk = { .total = 0 }; + + i_mmap_lock_read(mapping); + WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops, + &wpwalk)); + i_mmap_unlock_read(mapping); + + return wpwalk.total; +} +EXPORT_SYMBOL_GPL(wp_shared_mapping_range); + +/** + * clean_record_shared_mapping_range - Clean and record all ptes in an + * address space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to number of the first set bit in @bitmap. + * is modified as new bits are set by the function. + * @end: Pointer to the number of the last set bit in @bitmap. + * none set. The value is modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. The guarantees are as follows: + * a) All ptes dirty when the function starts executing will end up recorded + * in the bitmap. + * b) All ptes dirtied after that will either remain dirty, be recorded in the + * bitmap or both. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * Note: This function currently skips transhuge page-table entries, since + * it's intended for dirty-tracking on the PTE level. It will warn on + * encountering transhuge dirty entries, though, and can easily be extended + * to handle them as well. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long clean_record_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct clean_walk cwalk = { + .base = { .total = 0 }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + + i_mmap_lock_read(mapping); + WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops, + &cwalk.base)); + i_mmap_unlock_read(mapping); + + *start = cwalk.start; + *end = cwalk.end; + + return cwalk.base.total; +} +EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46ad252e6d6a..01f3f8b665e9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1800,7 +1800,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) struct mem_cgroup *iter; spin_lock(&memcg_oom_lock); - mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); + mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; spin_unlock(&memcg_oom_lock); diff --git a/mm/memory.c b/mm/memory.c index b1ca51a079f2..b6a5d6a08438 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -118,6 +118,18 @@ int randomize_va_space __read_mostly = 2; #endif +#ifndef arch_faults_on_old_pte +static inline bool arch_faults_on_old_pte(void) +{ + /* + * Those arches which don't have hw access flag feature need to + * implement their own helper. By default, "true" means pagefault + * will be hit on old pte. + */ + return true; +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -2145,32 +2157,82 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +static inline bool cow_user_page(struct page *dst, struct page *src, + struct vm_fault *vmf) { + bool ret; + void *kaddr; + void __user *uaddr; + bool force_mkyoung; + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = vmf->address; + debug_dma_assert_idle(src); + if (likely(src)) { + copy_user_highpage(dst, src, addr, vma); + return true; + } + /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ - if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst); - void __user *uaddr = (void __user *)(va & PAGE_MASK); + kaddr = kmap_atomic(dst); + uaddr = (void __user *)(addr & PAGE_MASK); + + /* + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. + */ + force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte); + if (force_mkyoung) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + /* + * Other thread has already handled the fault + * and we don't need to do anything. If it's + * not the case, the fault will be triggered + * again on the same address. + */ + ret = false; + goto pte_unlock; + } + entry = pte_mkyoung(vmf->orig_pte); + if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) + update_mmu_cache(vma, addr, vmf->pte); + } + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* - * This really shouldn't fail, because the page is there - * in the page tables. But it might just be unreadable, - * in which case we just give up and fill the result with - * zeroes. + * Give a warn in case there can be some obscure + * use-case */ - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - clear_page(kaddr); - kunmap_atomic(kaddr); - flush_dcache_page(dst); - } else - copy_user_highpage(dst, src, va, vma); + WARN_ON_ONCE(1); + clear_page(kaddr); + } + + ret = true; + +pte_unlock: + if (force_mkyoung) + pte_unmap_unlock(vmf->pte, vmf->ptl); + kunmap_atomic(kaddr); + flush_dcache_page(dst); + + return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) @@ -2327,7 +2389,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, vmf->address, vma); + + if (!cow_user_page(new_page, old_page, vmf)) { + /* + * COW failed, if the fault was solved by other, + * it's fine. If not, userspace would re-fault on + * the same address and we will handle the fault + * from the second attempt. + */ + put_page(new_page); + if (old_page) + put_page(old_page); + return 0; + } } if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3b62a9ff8ea0..f307bd82d750 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -331,7 +331,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long end_pfn) { for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(start_pfn))) + if (unlikely(!pfn_to_online_page(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -356,7 +356,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, /* pfn is the end pfn of a memory section. */ pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(pfn))) + if (unlikely(!pfn_to_online_page(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -415,7 +415,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, */ pfn = zone_start_pfn; for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(pfn))) + if (unlikely(!pfn_to_online_page(pfn))) continue; if (page_zone(pfn_to_page(pfn)) != zone) @@ -471,6 +471,16 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn, struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; +#ifdef CONFIG_ZONE_DEVICE + /* + * Zone shrinking code cannot properly deal with ZONE_DEVICE. So + * we will not try to shrink the zones - which is okay as + * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. + */ + if (zone_idx(zone) == ZONE_DEVICE) + return; +#endif + pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a889e456168..f76ea05b1cb0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/interval_tree.h> #include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -28,6 +29,254 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { #endif /* + * The mmu notifier_mm structure is allocated and installed in + * mm->mmu_notifier_mm inside the mm_take_all_locks() protected + * critical section and it's released only when mm_count reaches zero + * in mmdrop(). + */ +struct mmu_notifier_mm { + /* all mmu notifiers registered in this mm are queued in this list */ + struct hlist_head list; + bool has_itree; + /* to serialize the list modifications and hlist_unhashed */ + spinlock_t lock; + unsigned long invalidate_seq; + unsigned long active_invalidate_ranges; + struct rb_root_cached itree; + wait_queue_head_t wq; + struct hlist_head deferred_list; +}; + +/* + * This is a collision-retry read-side/write-side 'lock', a lot like a + * seqcount, however this allows multiple write-sides to hold it at + * once. Conceptually the write side is protecting the values of the PTEs in + * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any + * writer exists. + * + * Note that the core mm creates nested invalidate_range_start()/end() regions + * within the same thread, and runs invalidate_range_start()/end() in parallel + * on multiple CPUs. This is designed to not reduce concurrency or block + * progress on the mm side. + * + * As a secondary function, holding the full write side also serves to prevent + * writers for the itree, this is an optimization to avoid extra locking + * during invalidate_range_start/end notifiers. + * + * The write side has two states, fully excluded: + * - mm->active_invalidate_ranges != 0 + * - mnn->invalidate_seq & 1 == True (odd) + * - some range on the mm_struct is being invalidated + * - the itree is not allowed to change + * + * And partially excluded: + * - mm->active_invalidate_ranges != 0 + * - mnn->invalidate_seq & 1 == False (even) + * - some range on the mm_struct is being invalidated + * - the itree is allowed to change + * + * Operations on mmu_notifier_mm->invalidate_seq (under spinlock): + * seq |= 1 # Begin writing + * seq++ # Release the writing state + * seq & 1 # True if a writer exists + * + * The later state avoids some expensive work on inv_end in the common case of + * no mni monitoring the VA. + */ +static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm) +{ + lockdep_assert_held(&mmn_mm->lock); + return mmn_mm->invalidate_seq & 1; +} + +static struct mmu_interval_notifier * +mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm, + const struct mmu_notifier_range *range, + unsigned long *seq) +{ + struct interval_tree_node *node; + struct mmu_interval_notifier *res = NULL; + + spin_lock(&mmn_mm->lock); + mmn_mm->active_invalidate_ranges++; + node = interval_tree_iter_first(&mmn_mm->itree, range->start, + range->end - 1); + if (node) { + mmn_mm->invalidate_seq |= 1; + res = container_of(node, struct mmu_interval_notifier, + interval_tree); + } + + *seq = mmn_mm->invalidate_seq; + spin_unlock(&mmn_mm->lock); + return res; +} + +static struct mmu_interval_notifier * +mn_itree_inv_next(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_next(&mni->interval_tree, range->start, + range->end - 1); + if (!node) + return NULL; + return container_of(node, struct mmu_interval_notifier, interval_tree); +} + +static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) +{ + struct mmu_interval_notifier *mni; + struct hlist_node *next; + + spin_lock(&mmn_mm->lock); + if (--mmn_mm->active_invalidate_ranges || + !mn_itree_is_invalidating(mmn_mm)) { + spin_unlock(&mmn_mm->lock); + return; + } + + /* Make invalidate_seq even */ + mmn_mm->invalidate_seq++; + + /* + * The inv_end incorporates a deferred mechanism like rtnl_unlock(). + * Adds and removes are queued until the final inv_end happens then + * they are progressed. This arrangement for tree updates is used to + * avoid using a blocking lock during invalidate_range_start. + */ + hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list, + deferred_item) { + if (RB_EMPTY_NODE(&mni->interval_tree.rb)) + interval_tree_insert(&mni->interval_tree, + &mmn_mm->itree); + else + interval_tree_remove(&mni->interval_tree, + &mmn_mm->itree); + hlist_del(&mni->deferred_item); + } + spin_unlock(&mmn_mm->lock); + + wake_up_all(&mmn_mm->wq); +} + +/** + * mmu_interval_read_begin - Begin a read side critical section against a VA + * range + * mni: The range to use + * + * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a + * collision-retry scheme similar to seqcount for the VA range under mni. If + * the mm invokes invalidation during the critical section then + * mmu_interval_read_retry() will return true. + * + * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs + * require a blocking context. The critical region formed by this can sleep, + * and the required 'user_lock' can also be a sleeping lock. + * + * The caller is required to provide a 'user_lock' to serialize both teardown + * and setup. + * + * The return value should be passed to mmu_interval_read_retry(). + */ +unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) +{ + struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm; + unsigned long seq; + bool is_invalidating; + + /* + * If the mni has a different seq value under the user_lock than we + * started with then it has collided. + * + * If the mni currently has the same seq value as the mmn_mm seq, then + * it is currently between invalidate_start/end and is colliding. + * + * The locking looks broadly like this: + * mn_tree_invalidate_start(): mmu_interval_read_begin(): + * spin_lock + * seq = READ_ONCE(mni->invalidate_seq); + * seq == mmn_mm->invalidate_seq + * spin_unlock + * spin_lock + * seq = ++mmn_mm->invalidate_seq + * spin_unlock + * op->invalidate_range(): + * user_lock + * mmu_interval_set_seq() + * mni->invalidate_seq = seq + * user_unlock + * + * [Required: mmu_interval_read_retry() == true] + * + * mn_itree_inv_end(): + * spin_lock + * seq = ++mmn_mm->invalidate_seq + * spin_unlock + * + * user_lock + * mmu_interval_read_retry(): + * mni->invalidate_seq != seq + * user_unlock + * + * Barriers are not needed here as any races here are closed by an + * eventual mmu_interval_read_retry(), which provides a barrier via the + * user_lock. + */ + spin_lock(&mmn_mm->lock); + /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ + seq = READ_ONCE(mni->invalidate_seq); + is_invalidating = seq == mmn_mm->invalidate_seq; + spin_unlock(&mmn_mm->lock); + + /* + * mni->invalidate_seq must always be set to an odd value via + * mmu_interval_set_seq() using the provided cur_seq from + * mn_itree_inv_start_range(). This ensures that if seq does wrap we + * will always clear the below sleep in some reasonable time as + * mmn_mm->invalidate_seq is even in the idle state. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (is_invalidating) + wait_event(mmn_mm->wq, + READ_ONCE(mmn_mm->invalidate_seq) != seq); + + /* + * Notice that mmu_interval_read_retry() can already be true at this + * point, avoiding loops here allows the caller to provide a global + * time bound. + */ + + return seq; +} +EXPORT_SYMBOL_GPL(mmu_interval_read_begin); + +static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, + struct mm_struct *mm) +{ + struct mmu_notifier_range range = { + .flags = MMU_NOTIFIER_RANGE_BLOCKABLE, + .event = MMU_NOTIFY_RELEASE, + .mm = mm, + .start = 0, + .end = ULONG_MAX, + }; + struct mmu_interval_notifier *mni; + unsigned long cur_seq; + bool ret; + + for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni; + mni = mn_itree_inv_next(mni, &range)) { + ret = mni->ops->invalidate(mni, &range, cur_seq); + WARN_ON(!ret); + } + + mn_itree_inv_end(mmn_mm); +} + +/* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap * runs with mm_users == 0. Other tasks may still invoke mmu notifiers @@ -39,7 +288,8 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ -void __mmu_notifier_release(struct mm_struct *mm) +static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, + struct mm_struct *mm) { struct mmu_notifier *mn; int id; @@ -49,7 +299,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all @@ -59,10 +309,9 @@ void __mmu_notifier_release(struct mm_struct *mm) if (mn->ops->release) mn->ops->release(mn, mm); - spin_lock(&mm->mmu_notifier_mm->lock); - while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { - mn = hlist_entry(mm->mmu_notifier_mm->list.first, - struct mmu_notifier, + spin_lock(&mmn_mm->lock); + while (unlikely(!hlist_empty(&mmn_mm->list))) { + mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier, hlist); /* * We arrived before mmu_notifier_unregister so @@ -72,7 +321,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ hlist_del_init_rcu(&mn->hlist); } - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mmn_mm->lock); srcu_read_unlock(&srcu, id); /* @@ -87,6 +336,17 @@ void __mmu_notifier_release(struct mm_struct *mm) synchronize_srcu(&srcu); } +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + + if (mmn_mm->has_itree) + mn_itree_release(mmn_mm, mm); + + if (!hlist_empty(&mmn_mm->list)) + mn_hlist_release(mmn_mm, mm); +} + /* * If no young bitflag is supported by the hardware, ->clear_flush_young can * unmap the address and return 1 or 0 depending if the mapping previously @@ -159,14 +419,43 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm, + const struct mmu_notifier_range *range) +{ + struct mmu_interval_notifier *mni; + unsigned long cur_seq; + + for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni; + mni = mn_itree_inv_next(mni, range)) { + bool ret; + + ret = mni->ops->invalidate(mni, range, cur_seq); + if (!ret) { + if (WARN_ON(mmu_notifier_range_blockable(range))) + continue; + goto out_would_block; + } + } + return 0; + +out_would_block: + /* + * On -EAGAIN the non-blocking caller is not allowed to call + * invalidate_range_end() + */ + mn_itree_inv_end(mmn_mm); + return -EAGAIN; +} + +static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, + struct mmu_notifier_range *range) { struct mmu_notifier *mn; int ret = 0; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { if (mn->ops->invalidate_range_start) { int _ret; @@ -190,15 +479,30 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) return ret; } -void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, - bool only_end) +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +{ + struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + int ret; + + if (mmn_mm->has_itree) { + ret = mn_itree_invalidate(mmn_mm, range); + if (ret) + return ret; + } + if (!hlist_empty(&mmn_mm->list)) + return mn_hlist_invalidate_range_start(mmn_mm, range); + return 0; +} + +static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, + struct mmu_notifier_range *range, + bool only_end) { struct mmu_notifier *mn; int id; - lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -225,6 +529,19 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, } } srcu_read_unlock(&srcu, id); +} + +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, + bool only_end) +{ + struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + if (mmn_mm->has_itree) + mn_itree_inv_end(mmn_mm); + + if (!hlist_empty(&mmn_mm->list)) + mn_hlist_invalidate_end(mmn_mm, range, only_end); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } @@ -243,8 +560,9 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } /* - * Same as mmu_notifier_register but here the caller must hold the - * mmap_sem in write mode. + * Same as mmu_notifier_register but here the caller must hold the mmap_sem in + * write mode. A NULL mn signals the notifier is being registered for itree + * mode. */ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -261,9 +579,6 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) fs_reclaim_release(GFP_KERNEL); } - mn->mm = mm; - mn->users = 1; - if (!mm->mmu_notifier_mm) { /* * kmalloc cannot be called under mm_take_all_locks(), but we @@ -271,21 +586,22 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * the write side of the mmap_sem. */ mmu_notifier_mm = - kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (!mmu_notifier_mm) return -ENOMEM; INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mmu_notifier_mm->invalidate_seq = 2; + mmu_notifier_mm->itree = RB_ROOT_CACHED; + init_waitqueue_head(&mmu_notifier_mm->wq); + INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list); } ret = mm_take_all_locks(mm); if (unlikely(ret)) goto out_clean; - /* Pairs with the mmdrop in mmu_notifier_unregister_* */ - mmgrab(mm); - /* * Serialize the update against mmu_notifier_unregister. A * side note: mmu_notifier_release can't run concurrently with @@ -293,13 +609,28 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * current->mm or explicitly with get_task_mm() or similar). * We can't race against any other mmu notifier method either * thanks to mm_take_all_locks(). + * + * release semantics on the initialization of the mmu_notifier_mm's + * contents are provided for unlocked readers. acquire can only be + * used while holding the mmgrab or mmget, and is safe because once + * created the mmu_notififer_mm is not freed until the mm is + * destroyed. As above, users holding the mmap_sem or one of the + * mm_take_all_locks() do not need to use acquire semantics. */ if (mmu_notifier_mm) - mm->mmu_notifier_mm = mmu_notifier_mm; + smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm); - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); - spin_unlock(&mm->mmu_notifier_mm->lock); + if (mn) { + /* Pairs with the mmdrop in mmu_notifier_unregister_* */ + mmgrab(mm); + mn->mm = mm; + mn->users = 1; + + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + } else + mm->mmu_notifier_mm->has_itree = true; mm_drop_all_locks(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); @@ -516,6 +847,180 @@ out_unlock: } EXPORT_SYMBOL_GPL(mmu_notifier_put); +static int __mmu_interval_notifier_insert( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + struct mmu_notifier_mm *mmn_mm, unsigned long start, + unsigned long length, const struct mmu_interval_notifier_ops *ops) +{ + mni->mm = mm; + mni->ops = ops; + RB_CLEAR_NODE(&mni->interval_tree.rb); + mni->interval_tree.start = start; + /* + * Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval. + */ + if (length == 0 || + check_add_overflow(start, length - 1, &mni->interval_tree.last)) + return -EOVERFLOW; + + /* Must call with a mmget() held */ + if (WARN_ON(atomic_read(&mm->mm_count) <= 0)) + return -EINVAL; + + /* pairs with mmdrop in mmu_interval_notifier_remove() */ + mmgrab(mm); + + /* + * If some invalidate_range_start/end region is going on in parallel + * we don't know what VA ranges are affected, so we must assume this + * new range is included. + * + * If the itree is invalidating then we are not allowed to change + * it. Retrying until invalidation is done is tricky due to the + * possibility for live lock, instead defer the add to + * mn_itree_inv_end() so this algorithm is deterministic. + * + * In all cases the value for the mni->invalidate_seq should be + * odd, see mmu_interval_read_begin() + */ + spin_lock(&mmn_mm->lock); + if (mmn_mm->active_invalidate_ranges) { + if (mn_itree_is_invalidating(mmn_mm)) + hlist_add_head(&mni->deferred_item, + &mmn_mm->deferred_list); + else { + mmn_mm->invalidate_seq |= 1; + interval_tree_insert(&mni->interval_tree, + &mmn_mm->itree); + } + mni->invalidate_seq = mmn_mm->invalidate_seq; + } else { + WARN_ON(mn_itree_is_invalidating(mmn_mm)); + /* + * The starting seq for a mni not under invalidation should be + * odd, not equal to the current invalidate_seq and + * invalidate_seq should not 'wrap' to the new seq any time + * soon. + */ + mni->invalidate_seq = mmn_mm->invalidate_seq - 1; + interval_tree_insert(&mni->interval_tree, &mmn_mm->itree); + } + spin_unlock(&mmn_mm->lock); + return 0; +} + +/** + * mmu_interval_notifier_insert - Insert an interval notifier + * @mni: Interval notifier to register + * @start: Starting virtual address to monitor + * @length: Length of the range to monitor + * @mm : mm_struct to attach to + * + * This function subscribes the interval notifier for notifications from the + * mm. Upon return the ops related to mmu_interval_notifier will be called + * whenever an event that intersects with the given range occurs. + * + * Upon return the range_notifier may not be present in the interval tree yet. + * The caller must use the normal interval notifier read flow via + * mmu_interval_read_begin() to establish SPTEs for this range. + */ +int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_mm *mmn_mm; + int ret; + + might_lock(&mm->mmap_sem); + + mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm); + if (!mmn_mm || !mmn_mm->has_itree) { + ret = mmu_notifier_register(NULL, mm); + if (ret) + return ret; + mmn_mm = mm->mmu_notifier_mm; + } + return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, + ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert); + +int mmu_interval_notifier_insert_locked( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + unsigned long start, unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_mm *mmn_mm; + int ret; + + lockdep_assert_held_write(&mm->mmap_sem); + + mmn_mm = mm->mmu_notifier_mm; + if (!mmn_mm || !mmn_mm->has_itree) { + ret = __mmu_notifier_register(NULL, mm); + if (ret) + return ret; + mmn_mm = mm->mmu_notifier_mm; + } + return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, + ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); + +/** + * mmu_interval_notifier_remove - Remove a interval notifier + * @mni: Interval notifier to unregister + * + * This function must be paired with mmu_interval_notifier_insert(). It cannot + * be called from any ops callback. + * + * Once this returns ops callbacks are no longer running on other CPUs and + * will not be called in future. + */ +void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) +{ + struct mm_struct *mm = mni->mm; + struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + unsigned long seq = 0; + + might_sleep(); + + spin_lock(&mmn_mm->lock); + if (mn_itree_is_invalidating(mmn_mm)) { + /* + * remove is being called after insert put this on the + * deferred list, but before the deferred list was processed. + */ + if (RB_EMPTY_NODE(&mni->interval_tree.rb)) { + hlist_del(&mni->deferred_item); + } else { + hlist_add_head(&mni->deferred_item, + &mmn_mm->deferred_list); + seq = mmn_mm->invalidate_seq; + } + } else { + WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb)); + interval_tree_remove(&mni->interval_tree, &mmn_mm->itree); + } + spin_unlock(&mmn_mm->lock); + + /* + * The possible sleep on progress in the invalidation requires the + * caller not hold any locks held by invalidation callbacks. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (seq) + wait_event(mmn_mm->wq, + READ_ONCE(mmn_mm->invalidate_seq) != seq); + + /* pairs with mmgrab in mmu_interval_notifier_insert() */ + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove); + /** * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed * diff --git a/mm/nommu.c b/mm/nommu.c index 99b7ec318824..7de592058ab4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -155,11 +155,11 @@ void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) return __vmalloc(size, flags, PAGE_KERNEL); } -void *vmalloc_user(unsigned long size) +static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, flags, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; @@ -172,8 +172,19 @@ void *vmalloc_user(unsigned long size) return ret; } + +void *vmalloc_user(unsigned long size) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} EXPORT_SYMBOL(vmalloc_user); +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_user_flags(size, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d48c2a986ea3..ea0b9e606ad1 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -10,8 +10,9 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; int err = 0; const struct mm_walk_ops *ops = walk->ops; + spinlock_t *ptl; - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (;;) { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) @@ -22,7 +23,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte++; } - pte_unmap(pte); + pte_unmap_unlock(pte, ptl); return err; } @@ -253,13 +254,23 @@ static int __walk_page_range(unsigned long start, unsigned long end, { int err = 0; struct vm_area_struct *vma = walk->vma; + const struct mm_walk_ops *ops = walk->ops; + + if (vma && ops->pre_vma) { + err = ops->pre_vma(start, end, walk); + if (err) + return err; + } if (vma && is_vm_hugetlb_page(vma)) { - if (walk->ops->hugetlb_entry) + if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); + if (vma && ops->post_vma) + ops->post_vma(walk); + return err; } @@ -290,6 +301,11 @@ static int __walk_page_range(unsigned long start, unsigned long end, * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * + * If operations need to be staged before and committed after a vma is walked, + * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), + * since it is intended to handle commit-type operations, can't return any + * errors. + * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some * caller-specific data to callbacks, @private should be helpful. @@ -376,3 +392,80 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, return err; return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } + +/** + * walk_page_mapping - walk all memory areas mapped into a struct address_space. + * @mapping: Pointer to the struct address_space + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * @ops: operation to call during the walk + * @private: private data for callbacks' usage + * + * This function walks all memory areas mapped into a struct address_space. + * The walk is limited to only the given page-size index range, but if + * the index boundaries cross a huge page-table entry, that entry will be + * included. + * + * Also see walk_page_range() for additional information. + * + * Locking: + * This function can't require that the struct mm_struct::mmap_sem is held, + * since @mapping may be mapped by multiple processes. Instead + * @mapping->i_mmap_rwsem must be held. This might have implications in the + * callbacks, and it's up tho the caller to ensure that the + * struct mm_struct::mmap_sem is not needed. + * + * Also this means that a caller can't rely on the struct + * vm_area_struct::vm_flags to be constant across a call, + * except for immutable flags. Callers requiring this shouldn't use + * this function. + * + * Return: 0 on success, negative error code on failure, positive number on + * caller defined premature termination. + */ +int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, + pgoff_t nr, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .private = private, + }; + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + int err = 0; + + lockdep_assert_held(&mapping->i_mmap_rwsem); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + walk.vma = vma; + walk.mm = vma->vm_mm; + + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); + if (err > 0) { + err = 0; + break; + } else if (err < 0) + break; + + err = __walk_page_range(start_addr, end_addr, &walk); + if (err) + break; + } + + return err; +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3c70e275f4e..4a7d7459c4f9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2672,6 +2672,26 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** + * vmalloc_user_node_flags - allocate memory for userspace on a specific node + * @size: allocation size + * @node: numa node + * @flags: flags for the page level allocator + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + flags | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, node, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + +/** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size * |