diff options
Diffstat (limited to 'mm')
47 files changed, 1740 insertions, 861 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a5dae9a7eb51..f332efe751dd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -284,6 +284,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool select SRCU + select INTERVAL_TREE config KSM bool "Enable KSM for page merging" @@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS config HMM_MIRROR bool depends on MMU - depends on MMU_NOTIFIER config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" @@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL config ARCH_HAS_HUGEPD bool +config MAPPING_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index d996846697ef..1937cc251883 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -107,3 +107,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d9daa3e422d0..c360f6a6c844 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -239,8 +239,8 @@ static int __init default_bdi_init(void) { int err; - bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND | WQ_SYSFS, 0); + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | + WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; diff --git a/mm/compaction.c b/mm/compaction.c index ce08b39d85d4..672d3c78c6ab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -270,14 +270,15 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, /* Ensure the start of the pageblock or zone is online and valid */ block_pfn = pageblock_start_pfn(pfn); - block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn)); + block_pfn = max(block_pfn, zone->zone_start_pfn); + block_page = pfn_to_online_page(block_pfn); if (block_page) { page = block_page; pfn = block_pfn; } /* Ensure the end of the pageblock or zone is online and valid */ - block_pfn += pageblock_nr_pages; + block_pfn = pageblock_end_pfn(pfn) - 1; block_pfn = min(block_pfn, zone_end_pfn(zone) - 1); end_page = pfn_to_online_page(block_pfn); if (!end_page) @@ -303,7 +304,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, page += (1 << PAGE_ALLOC_COSTLY_ORDER); pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); - } while (page < end_page); + } while (page <= end_page); return false; } diff --git a/mm/debug.c b/mm/debug.c index 8345bb6e4769..0461df1207cb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,28 +67,31 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx", - page, page_ref_count(page), mapcount, - page->mapping, page_to_pgoff(page)); if (PageCompound(page)) - pr_cont(" compound_mapcount: %d", compound_mapcount(page)); - pr_cont("\n"); - if (PageAnon(page)) - pr_warn("anon "); - else if (PageKsm(page)) - pr_warn("ksm "); + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px " + "index:%#lx compound_mapcount: %d\n", + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page), + compound_mapcount(page)); + else + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n", + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page)); + if (PageKsm(page)) + pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags); + else if (PageAnon(page)) + pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags); else if (mapping) { - pr_warn("%ps ", mapping->a_ops); if (mapping->host && mapping->host->i_dentry.first) { struct dentry *dentry; dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("name:\"%pd\" ", dentry); - } + pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); + } else + pr_warn("%ps\n", mapping->a_ops); + pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); - hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/filemap.c b/mm/filemap.c index 1146fcfa3215..85b7d087eb45 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -40,6 +40,7 @@ #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> +#include <linux/ramfs.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -1973,7 +1973,8 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, } static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { unsigned long pte_end; struct page *head, *page; @@ -1986,7 +1987,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, pte = READ_ONCE(*ptep); - if (!pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; /* hugepages are never "special" */ @@ -2023,7 +2024,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, } static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned int pdshift, unsigned long end, int write, + unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { pte_t *ptep; @@ -2033,7 +2034,7 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) return 0; } while (ptep++, addr = next, addr != end); @@ -2041,7 +2042,7 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, } #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, int write, + unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; @@ -2049,7 +2050,8 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, int *nr) + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { struct page *head, *page; int refs; @@ -26,193 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) -{ - struct hmm *hmm; - - hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); - if (!hmm) - return ERR_PTR(-ENOMEM); - - init_waitqueue_head(&hmm->wq); - INIT_LIST_HEAD(&hmm->mirrors); - init_rwsem(&hmm->mirrors_sem); - INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->ranges_lock); - hmm->notifiers = 0; - return &hmm->mmu_notifier; -} - -static void hmm_free_notifier(struct mmu_notifier *mn) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - WARN_ON(!list_empty(&hmm->ranges)); - WARN_ON(!list_empty(&hmm->mirrors)); - kfree(hmm); -} - -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - - /* - * Since hmm_range_register() holds the mmget() lock hmm_release() is - * prevented as long as a range exists. - */ - WARN_ON(!list_empty_careful(&hmm->ranges)); - - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - /* - * Note: The driver is not allowed to trigger - * hmm_mirror_unregister() from this thread. - */ - if (mirror->ops->release) - mirror->ops->release(mirror); - } - up_read(&hmm->mirrors_sem); -} - -static void notifiers_decrement(struct hmm *hmm) -{ - unsigned long flags; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); -} - -static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - struct hmm_range *range; - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers++; - list_for_each_entry(range, &hmm->ranges, list) { - if (nrange->end < range->start || nrange->start >= range->end) - continue; - - range->valid = false; - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - if (mmu_notifier_range_blockable(nrange)) - down_read(&hmm->mirrors_sem); - else if (!down_read_trylock(&hmm->mirrors_sem)) { - ret = -EAGAIN; - goto out; - } - - list_for_each_entry(mirror, &hmm->mirrors, list) { - int rc; - - rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); - if (rc) { - if (WARN_ON(mmu_notifier_range_blockable(nrange) || - rc != -EAGAIN)) - continue; - ret = -EAGAIN; - break; - } - } - up_read(&hmm->mirrors_sem); - -out: - if (ret) - notifiers_decrement(hmm); - return ret; -} - -static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - notifiers_decrement(hmm); -} - -static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { - .release = hmm_release, - .invalidate_range_start = hmm_invalidate_range_start, - .invalidate_range_end = hmm_invalidate_range_end, - .alloc_notifier = hmm_alloc_notifier, - .free_notifier = hmm_free_notifier, -}; - -/* - * hmm_mirror_register() - register a mirror against an mm - * - * @mirror: new mirror struct to register - * @mm: mm to register against - * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments - * - * To start mirroring a process address space, the device driver must register - * an HMM mirror struct. - * - * The caller cannot unregister the hmm_mirror while any ranges are - * registered. - * - * Callers using this function must put a call to mmu_notifier_synchronize() - * in their module exit functions. - */ -int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) -{ - struct mmu_notifier *mn; - - lockdep_assert_held_write(&mm->mmap_sem); - - /* Sanity check */ - if (!mm || !mirror || !mirror->ops) - return -EINVAL; - - mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); - if (IS_ERR(mn)) - return PTR_ERR(mn); - mirror->hmm = container_of(mn, struct hmm, mmu_notifier); - - down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - - return 0; -} -EXPORT_SYMBOL(hmm_mirror_register); - -/* - * hmm_mirror_unregister() - unregister a mirror - * - * @mirror: mirror struct to unregister - * - * Stop mirroring a process address space, and cleanup. - */ -void hmm_mirror_unregister(struct hmm_mirror *mirror) -{ - struct hmm *hmm = mirror->hmm; - - down_write(&hmm->mirrors_sem); - list_del(&mirror->list); - up_write(&hmm->mirrors_sem); - mmu_notifier_put(&hmm->mmu_notifier); -} -EXPORT_SYMBOL(hmm_mirror_unregister); - struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; @@ -252,18 +65,15 @@ err: return -EFAULT; } -static int hmm_pfns_bad(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_pfns_fill(unsigned long addr, unsigned long end, + struct hmm_range *range, enum hmm_pfn_value_e value) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[HMM_PFN_ERROR]; + pfns[i] = range->values[value]; return 0; } @@ -532,8 +342,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { - *pfn = range->values[HMM_PFN_SPECIAL]; - return -EFAULT; + if (!is_zero_pfn(pte_pfn(pte))) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + /* + * Since each architecture defines a struct page for the zero + * page, just fall through and treat it like a normal page. + */ } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; @@ -584,7 +400,7 @@ again: } return 0; } else if (!pmd_present(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -612,7 +428,7 @@ again: * recover. */ if (pmd_bad(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); ptep = pte_offset_map(pmdp, addr); i = (addr - range->start) >> PAGE_SHIFT; @@ -770,93 +586,55 @@ unlock: #define hmm_vma_walk_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static void hmm_pfns_clear(struct hmm_range *range, - uint64_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = range->values[HMM_PFN_NONE]; -} - -/* - * hmm_range_register() - start tracking change to CPU page table over a range - * @range: range - * @mm: the mm struct for the range of virtual address - * - * Return: 0 on success, -EFAULT if the address space is no longer valid - * - * Track updates to the CPU page table see include/linux/hmm.h - */ -int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) +static int hmm_vma_walk_test(unsigned long start, unsigned long end, + struct mm_walk *walk) { - struct hmm *hmm = mirror->hmm; - unsigned long flags; - - range->valid = false; - range->hmm = NULL; - - if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) - return -EINVAL; - if (range->start >= range->end) - return -EINVAL; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; - /* Prevent hmm_release() from running while the range is valid */ - if (!mmget_not_zero(hmm->mmu_notifier.mm)) + /* + * Skip vma ranges that don't have struct page backing them or + * map I/O devices directly. + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) return -EFAULT; - /* Initialize range to track CPU page table updates. */ - spin_lock_irqsave(&hmm->ranges_lock, flags); - - range->hmm = hmm; - list_add(&range->list, &hmm->ranges); - /* - * If there are any concurrent notifiers we have to wait for them for - * the range to be valid (see hmm_range_wait_until_valid()). + * If the vma does not allow read access, then assume that it does not + * allow write access either. HMM does not support architectures + * that allow write without read. */ - if (!hmm->notifiers) - range->valid = true; - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - return 0; -} -EXPORT_SYMBOL(hmm_range_register); + if (!(vma->vm_flags & VM_READ)) { + bool fault, write_fault; -/* - * hmm_range_unregister() - stop tracking change to CPU page table over a range - * @range: range - * - * Range struct is used to track updates to the CPU page table after a call to - * hmm_range_register(). See include/linux/hmm.h for how to use it. - */ -void hmm_range_unregister(struct hmm_range *range) -{ - struct hmm *hmm = range->hmm; - unsigned long flags; + /* + * Check to see if a fault is requested for any page in the + * range. + */ + hmm_range_need_fault(hmm_vma_walk, range->pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, + 0, &fault, &write_fault); + if (fault || write_fault) + return -EFAULT; - spin_lock_irqsave(&hmm->ranges_lock, flags); - list_del_init(&range->list); - spin_unlock_irqrestore(&hmm->ranges_lock, flags); + hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + hmm_vma_walk->last = end; - /* Drop reference taken by hmm_range_register() */ - mmput(hmm->mmu_notifier.mm); + /* Skip this vma and continue processing the next vma. */ + return 1; + } - /* - * The range is now invalid and the ref on the hmm is dropped, so - * poison the pointer. Leave other fields in place, for the caller's - * use. - */ - range->valid = false; - memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); + return 0; } -EXPORT_SYMBOL(hmm_range_unregister); static const struct mm_walk_ops hmm_walk_ops = { .pud_entry = hmm_vma_walk_pud, .pmd_entry = hmm_vma_walk_pmd, .pte_hole = hmm_vma_walk_hole, .hugetlb_entry = hmm_vma_walk_hugetlb_entry, + .test_walk = hmm_vma_walk_test, }; /** @@ -889,210 +667,27 @@ static const struct mm_walk_ops hmm_walk_ops = { */ long hmm_range_fault(struct hmm_range *range, unsigned int flags) { - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; - struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; + struct hmm_vma_walk hmm_vma_walk = { + .range = range, + .last = range->start, + .flags = flags, + }; + struct mm_struct *mm = range->notifier->mm; int ret; - lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); + lockdep_assert_held(&mm->mmap_sem); do { /* If range is no longer valid force retry. */ - if (!range->valid) + if (mmu_interval_check_retry(range->notifier, + range->notifier_seq)) return -EBUSY; + ret = walk_page_range(mm, hmm_vma_walk.last, range->end, + &hmm_walk_ops, &hmm_vma_walk); + } while (ret == -EBUSY); - vma = find_vma(hmm->mmu_notifier.mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it - * does not allow write access, either. HMM does not - * support architecture that allow write without read. - */ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } - - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; - hmm_vma_walk.flags = flags; - hmm_vma_walk.range = range; - end = min(range->end, vma->vm_end); - - walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, - &hmm_vma_walk); - - do { - ret = walk_page_range(vma->vm_mm, start, end, - &hmm_walk_ops, &hmm_vma_walk); - start = hmm_vma_walk.last; - - /* Keep trying while the range is valid. */ - } while (ret == -EBUSY && range->valid); - - if (ret) { - unsigned long i; - - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], - hmm_vma_walk.last, range->end); - return ret; - } - start = end; - - } while (start < range->end); - + if (ret) + return ret; return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); - -/** - * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. - * @range: range being faulted - * @device: device to map page to - * @daddrs: array of dma addresses for the mapped pages - * @flags: HMM_FAULT_* - * - * Return: the number of pages mapped on success (including zero), or any - * status return from hmm_range_fault() otherwise. - */ -long hmm_range_dma_map(struct hmm_range *range, struct device *device, - dma_addr_t *daddrs, unsigned int flags) -{ - unsigned long i, npages, mapped; - long ret; - - ret = hmm_range_fault(range, flags); - if (ret <= 0) - return ret ? ret : -EBUSY; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0, mapped = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - /* - * FIXME need to update DMA API to provide invalid DMA address - * value instead of a function to test dma address value. This - * would remove lot of dumb code duplicated accross many arch. - * - * For now setting it to 0 here is good enough as the pfns[] - * value is what is use to check what is valid and what isn't. - */ - daddrs[i] = 0; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - /* Check if range is being invalidated */ - if (!range->valid) { - ret = -EBUSY; - goto unmap; - } - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) - dir = DMA_BIDIRECTIONAL; - - daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); - if (dma_mapping_error(device, daddrs[i])) { - ret = -EFAULT; - goto unmap; - } - - mapped++; - } - - return mapped; - -unmap: - for (npages = i, i = 0; (i < npages) && mapped; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - if (dma_mapping_error(device, daddrs[i])) - continue; - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) - dir = DMA_BIDIRECTIONAL; - - dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); - mapped--; - } - - return ret; -} -EXPORT_SYMBOL(hmm_range_dma_map); - -/** - * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() - * @range: range being unmapped - * @device: device against which dma map was done - * @daddrs: dma address of mapped pages - * @dirty: dirty page if it had the write flag set - * Return: number of page unmapped on success, -EINVAL otherwise - * - * Note that caller MUST abide by mmu notifier or use HMM mirror and abide - * to the sync_cpu_device_pagetables() callback so that it is safe here to - * call set_page_dirty(). Caller must also take appropriate locks to avoid - * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. - */ -long hmm_range_dma_unmap(struct hmm_range *range, - struct device *device, - dma_addr_t *daddrs, - bool dirty) -{ - unsigned long i, npages; - long cpages = 0; - - /* Sanity check. */ - if (range->end <= range->start) - return -EINVAL; - if (!daddrs) - return -EINVAL; - if (!range->pfns) - return -EINVAL; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { - dir = DMA_BIDIRECTIONAL; - - /* - * See comments in function description on why it is - * safe here to call set_page_dirty() - */ - if (dirty) - set_page_dirty(page); - } - - /* Unmap and clear pfns/dma address */ - dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); - range->pfns[i] = range->values[HMM_PFN_NONE]; - /* FIXME see comments in hmm_vma_dma_map() */ - daddrs[i] = 0; - cpages++; - } - - return cpages; -} -EXPORT_SYMBOL(hmm_range_dma_unmap); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c5cb6dcd6c69..13cc93785006 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2789,8 +2789,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } - if (mapping) - __dec_node_page_state(page, NR_SHMEM_THPS); + if (mapping) { + if (PageSwapBacked(page)) + __dec_node_page_state(page, NR_SHMEM_THPS); + else + __dec_node_page_state(page, NR_FILE_THPS); + } + spin_unlock(&ds_queue->split_queue_lock); __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef37c85423a5..b45a95363a84 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1084,11 +1084,10 @@ static bool pfn_range_valid_gigantic(struct zone *z, struct page *page; for (i = start_pfn; i < end_pfn; i++) { - if (!pfn_valid(i)) + page = pfn_to_online_page(i); + if (!page) return false; - page = pfn_to_page(i); - if (page_zone(page) != z) return false; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f1930fa0b445..2ac38bdc18a1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget_online(&h_cg->css)) { + if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } diff --git a/mm/init-mm.c b/mm/init-mm.c index fb1e15028ef0..19603302a77f 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -5,6 +5,7 @@ #include <linux/spinlock.h> #include <linux/list.h> #include <linux/cpumask.h> +#include <linux/mman.h> #include <linux/atomic.h> #include <linux/user_namespace.h> diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0a1b4b484ac5..a8a57bebb5fa 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1028,12 +1028,13 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); - pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); + + pte = pte_offset_map(pmd, address); + pte_ptl = pte_lockptr(mm, pmd); + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1601,17 +1602,6 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_unlocked; } - } else if (!PageUptodate(page)) { - xas_unlock_irq(&xas); - wait_on_page_locked(page); - if (!trylock_page(page)) { - result = SCAN_PAGE_LOCK; - goto xa_unlocked; - } - get_page(page); - } else if (PageDirty(page)) { - result = SCAN_FAIL; - goto xa_locked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); @@ -1626,7 +1616,12 @@ static void collapse_file(struct mm_struct *mm, * without racing with truncate. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageUptodate(page), page); + + /* make sure the page is up to date */ + if (unlikely(!PageUptodate(page))) { + result = SCAN_FAIL; + goto out_unlock; + } /* * If file was truncated then extended, or hole-punched, before @@ -1642,6 +1637,16 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } + if (!is_shmem && PageDirty(page)) { + /* + * khugepaged only works on read-only fd, so this + * page is dirty because it hasn't been flushed + * since first write. + */ + result = SCAN_FAIL; + goto out_unlock; + } + if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 03a8d84badad..244607663363 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -527,6 +527,16 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Remove an object from the object_tree_root and object_list. Must be called + * with the kmemleak_lock held _if_ kmemleak is still enabled. + */ +static void __remove_object(struct kmemleak_object *object) +{ + rb_erase(&object->rb_node, &object_tree_root); + list_del_rcu(&object->object_list); +} + +/* * Look up an object in the object search tree and remove it from both * object_tree_root and object_list. The returned object's use_count should be * at least 1, as initially set by create_object(). @@ -538,10 +548,8 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali write_lock_irqsave(&kmemleak_lock, flags); object = lookup_object(ptr, alias); - if (object) { - rb_erase(&object->rb_node, &object_tree_root); - list_del_rcu(&object->object_list); - } + if (object) + __remove_object(object); write_unlock_irqrestore(&kmemleak_lock, flags); return object; @@ -1834,12 +1842,16 @@ static const struct file_operations kmemleak_fops = { static void __kmemleak_do_cleanup(void) { - struct kmemleak_object *object; + struct kmemleak_object *object, *tmp; - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); + /* + * Kmemleak has already been disabled, no need for RCU list traversal + * or kmemleak_lock held. + */ + list_for_each_entry_safe(object, tmp, &object_list, object_list) { + __remove_object(object); + __delete_object(object); + } } /* @@ -885,13 +885,13 @@ static int remove_stable_node(struct stable_node *stable_node) return 0; } - if (WARN_ON_ONCE(page_mapped(page))) { - /* - * This should not happen: but if it does, just refuse to let - * merge_across_nodes be switched - there is no need to panic. - */ - err = -EBUSY; - } else { + /* + * Page could be still mapped if this races with __mmput() running in + * between ksm_exit() and exit_mmap(). Just refuse to let + * merge_across_nodes/max_page_sharing be switched. + */ + err = -EBUSY; + if (!page_mapped(page)) { /* * The stable node did not yet appear stale to get_ksm_page(), * since that allows for an unmapped ksm page to be recognized diff --git a/mm/maccess.c b/mm/maccess.c index d065736f6b87..3ca8d97e5010 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -18,6 +18,18 @@ probe_read_common(void *dst, const void __user *src, size_t size) return ret ? -EFAULT : 0; } +static __always_inline long +probe_write_common(void __user *dst, const void *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_to_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data @@ -31,11 +43,20 @@ probe_read_common(void *dst, const void __user *src, size_t size) * do_page_fault() doesn't attempt to take mmap_sem. This makes * probe_kernel_read() suitable for use within regions where the caller * already holds mmap_sem, or other locks which nest inside mmap_sem. + * + * probe_kernel_read_strict() is the same as probe_kernel_read() except for + * the case where architectures have non-overlapping user and kernel address + * ranges: probe_kernel_read_strict() will additionally return -EFAULT for + * probing memory on a user address range where probe_user_read() is supposed + * to be used instead. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_read"))); +long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + long __probe_kernel_read(void *dst, const void *src, size_t size) { long ret; @@ -85,6 +106,7 @@ EXPORT_SYMBOL_GPL(probe_user_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ + long __weak probe_kernel_write(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_write"))); @@ -94,15 +116,39 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - pagefault_enable(); + ret = probe_write_common((__force void __user *)dst, src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_write); +/** + * probe_user_write(): safely attempt to write to a user-space location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ + +long __weak probe_user_write(void __user *dst, const void *src, size_t size) + __attribute__((alias("__probe_user_write"))); + +long __probe_user_write(void __user *dst, const void *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(dst, size)) + ret = probe_write_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_write); /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. @@ -120,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_kernel_write); * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. + * + * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except + * for the case where architectures have non-overlapping user and kernel address + * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for + * probing memory on a user address range where strncpy_from_unsafe_user() is + * supposed to be used instead. */ -long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) + +long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) + __attribute__((alias("__strncpy_from_unsafe"))); + +long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, + long count) + __attribute__((alias("__strncpy_from_unsafe"))); + +long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); const void *src = unsafe_addr; diff --git a/mm/madvise.c b/mm/madvise.c index 2be9f3fdb05e..94c343b4c968 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -363,8 +363,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, ClearPageReferenced(page); test_and_clear_page_young(page); if (pageout) { - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); + if (!isolate_lru_page(page)) { + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + } } else deactivate_page(page); huge_unlock: @@ -441,8 +445,12 @@ regular_page: ClearPageReferenced(page); test_and_clear_page_young(page); if (pageout) { - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); + if (!isolate_lru_page(page)) { + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + } } else deactivate_page(page); } diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c new file mode 100644 index 000000000000..71070dda9643 --- /dev/null +++ b/mm/mapping_dirty_helpers.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagewalk.h> +#include <linux/hugetlb.h> +#include <linux/bitops.h> +#include <linux/mmu_notifier.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +/** + * struct wp_walk - Private struct for pagetable walk callbacks + * @range: Range for mmu notifiers + * @tlbflush_start: Address of first modified pte + * @tlbflush_end: Address of last modified pte + 1 + * @total: Total number of modified ptes + */ +struct wp_walk { + struct mmu_notifier_range range; + unsigned long tlbflush_start; + unsigned long tlbflush_end; + unsigned long total; +}; + +/** + * wp_pte - Write-protect a pte + * @pte: Pointer to the pte + * @addr: The virtual page address + * @walk: pagetable walk callback argument + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + */ +static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); + wpwalk->total++; + wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); + wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, + addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct clean_walk - Private struct for the clean_record_pte function. + * @base: struct wp_walk we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. + * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct clean_walk { + struct wp_walk base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base) + +/** + * clean_record_pte - Clean a pte and record its address space offset in a + * bitmap + * @pte: Pointer to the pte + * @addr: The virtual page address + * @walk: pagetable walk callback argument + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + */ +static int clean_record_pte(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + struct clean_walk *cwalk = to_clean_walk(wpwalk); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + + walk->vma->vm_pgoff - cwalk->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); + + wpwalk->total++; + wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); + wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, + addr + PAGE_SIZE); + + __set_bit(pgoff, cwalk->bitmap); + cwalk->start = min(cwalk->start, pgoff); + cwalk->end = max(cwalk->end, pgoff + 1); + } + + return 0; +} + +/* wp_clean_pmd_entry - The pagewalk pmd callback. */ +static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + /* Dirty-tracking should be handled on the pte level */ + pmd_t pmdval = pmd_read_atomic(pmd); + + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); + + return 0; +} + +/* wp_clean_pud_entry - The pagewalk pud callback. */ +static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + /* Dirty-tracking should be handled on the pte level */ + pud_t pudval = READ_ONCE(*pud); + + if (pud_trans_huge(pudval) || pud_devmap(pudval)) + WARN_ON(pud_write(pudval) || pud_dirty(pudval)); + + return 0; +} + +/* + * wp_clean_pre_vma - The pagewalk pre_vma callback. + * + * The pre_vma callback performs the cache flush, stages the tlb flush + * and calls the necessary mmu notifiers. + */ +static int wp_clean_pre_vma(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + + wpwalk->tlbflush_start = end; + wpwalk->tlbflush_end = start; + + mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, + walk->vma, walk->mm, start, end); + mmu_notifier_invalidate_range_start(&wpwalk->range); + flush_cache_range(walk->vma, start, end); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected, whereas + * tlb_gather_mmu() records the full range. + */ + inc_tlb_flush_pending(walk->mm); + + return 0; +} + +/* + * wp_clean_post_vma - The pagewalk post_vma callback. + * + * The post_vma callback performs the tlb flush and calls necessary mmu + * notifiers. + */ +static void wp_clean_post_vma(struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + + if (mm_tlb_flush_nested(walk->mm)) + flush_tlb_range(walk->vma, wpwalk->range.start, + wpwalk->range.end); + else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start) + flush_tlb_range(walk->vma, wpwalk->tlbflush_start, + wpwalk->tlbflush_end); + + mmu_notifier_invalidate_range_end(&wpwalk->range); + dec_tlb_flush_pending(walk->mm); +} + +/* + * wp_clean_test_walk - The pagewalk test_walk callback. + * + * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas. + */ +static int wp_clean_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != + (VM_SHARED | VM_MAYWRITE)) + return 1; + + return 0; +} + +static const struct mm_walk_ops clean_walk_ops = { + .pte_entry = clean_record_pte, + .pmd_entry = wp_clean_pmd_entry, + .pud_entry = wp_clean_pud_entry, + .test_walk = wp_clean_test_walk, + .pre_vma = wp_clean_pre_vma, + .post_vma = wp_clean_post_vma +}; + +static const struct mm_walk_ops wp_walk_ops = { + .pte_entry = wp_pte, + .pmd_entry = wp_clean_pmd_entry, + .pud_entry = wp_clean_pud_entry, + .test_walk = wp_clean_test_walk, + .pre_vma = wp_clean_pre_vma, + .post_vma = wp_clean_post_vma +}; + +/** + * wp_shared_mapping_range - Write-protect all ptes in an address space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * Note: This function currently skips transhuge page-table entries, since + * it's intended for dirty-tracking on the PTE level. It will warn on + * encountering transhuge write-enabled entries, though, and can easily be + * extended to handle them as well. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. + */ +unsigned long wp_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct wp_walk wpwalk = { .total = 0 }; + + i_mmap_lock_read(mapping); + WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops, + &wpwalk)); + i_mmap_unlock_read(mapping); + + return wpwalk.total; +} +EXPORT_SYMBOL_GPL(wp_shared_mapping_range); + +/** + * clean_record_shared_mapping_range - Clean and record all ptes in an + * address space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to number of the first set bit in @bitmap. + * is modified as new bits are set by the function. + * @end: Pointer to the number of the last set bit in @bitmap. + * none set. The value is modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. The guarantees are as follows: + * a) All ptes dirty when the function starts executing will end up recorded + * in the bitmap. + * b) All ptes dirtied after that will either remain dirty, be recorded in the + * bitmap or both. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * Note: This function currently skips transhuge page-table entries, since + * it's intended for dirty-tracking on the PTE level. It will warn on + * encountering transhuge dirty entries, though, and can easily be extended + * to handle them as well. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long clean_record_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct clean_walk cwalk = { + .base = { .total = 0 }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + + i_mmap_lock_read(mapping); + WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops, + &cwalk.base)); + i_mmap_unlock_read(mapping); + + *start = cwalk.start; + *end = cwalk.end; + + return cwalk.base.total; +} +EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range); diff --git a/mm/memblock.c b/mm/memblock.c index 7d4f61ae666a..c4b16cae2bc9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, align = SMP_CACHE_BYTES; } - if (end > memblock.current_limit) - end = memblock.current_limit; - again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); @@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_internal( if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); /* retry allocation without lower limit */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c313c49074ca..01f3f8b665e9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -484,7 +484,7 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - if (PageHead(page) && PageSlab(page)) + if (PageSlab(page) && !PageTail(page)) memcg = memcg_from_slab_page(page); else memcg = READ_ONCE(page->mem_cgroup); @@ -960,7 +960,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) if (unlikely(!memcg)) memcg = root_mem_cgroup; } - } while (!css_tryget_online(&memcg->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { @@ -1795,7 +1800,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) struct mem_cgroup *iter; spin_lock(&memcg_oom_lock); - mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); + mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; spin_unlock(&memcg_oom_lock); @@ -2530,6 +2535,15 @@ retry: } /* + * Memcg doesn't have a dedicated reserve for atomic + * allocations. But like the global atomic pool, we need to + * put the burden of reclaim on regular allocation requests + * and let these go through as privileged allocations. + */ + if (gfp_mask & __GFP_ATOMIC) + goto force; + + /* * Unlike in global OOM situations, memcg is not in a physical * memory shortage. Allow dying and OOM-killed tasks to * bypass the last charges so that they can exit quickly and @@ -5009,12 +5023,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - /* - * Flush percpu vmstats and vmevents to guarantee the value correctness - * on parent's and all ancestor levels. - */ - memcg_flush_percpu_vmstats(memcg, false); - memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); @@ -5025,6 +5033,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { memcg_wb_domain_exit(memcg); + /* + * Flush percpu vmstats and vmevents to guarantee the value correctness + * on parent's and all ancestor levels. + */ + memcg_flush_percpu_vmstats(memcg, false); + memcg_flush_percpu_vmevents(memcg); __mem_cgroup_free(memcg); } @@ -5415,6 +5429,8 @@ static int mem_cgroup_move_account(struct page *page, struct mem_cgroup *from, struct mem_cgroup *to) { + struct lruvec *from_vec, *to_vec; + struct pglist_data *pgdat; unsigned long flags; unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; @@ -5438,11 +5454,15 @@ static int mem_cgroup_move_account(struct page *page, anon = PageAnon(page); + pgdat = page_pgdat(page); + from_vec = mem_cgroup_lruvec(pgdat, from); + to_vec = mem_cgroup_lruvec(pgdat, to); + spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); - __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); + __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); } /* @@ -5454,14 +5474,14 @@ static int mem_cgroup_move_account(struct page *page, struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); - __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); + __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages); } } if (PageWriteback(page)) { - __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); - __mod_memcg_state(to, NR_WRITEBACK, nr_pages); + __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); + __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7ef849da8278..3151c87dff73 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -199,7 +199,6 @@ struct to_kill { struct task_struct *tsk; unsigned long addr; short size_shift; - char addr_valid; }; /* @@ -324,22 +323,27 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } } tk->addr = page_address_in_vma(p, vma); - tk->addr_valid = 1; if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(p, vma); else tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT; /* - * In theory we don't have to kill when the page was - * munmaped. But it could be also a mremap. Since that's - * likely very rare kill anyways just out of paranoia, but use - * a SIGKILL because the error is not contained anymore. + * Send SIGKILL if "tk->addr == -EFAULT". Also, as + * "tk->size_shift" is always non-zero for !is_zone_device_page(), + * so "tk->size_shift == 0" effectively checks no mapping on + * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times + * to a process' address space, it's possible not all N VMAs + * contain mappings for the page, but at least one VMA does. + * Only deliver SIGBUS with payload derived from the VMA that + * has a mapping for the page. */ - if (tk->addr == -EFAULT || tk->size_shift == 0) { + if (tk->addr == -EFAULT) { pr_info("Memory failure: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); - tk->addr_valid = 0; + } else if (tk->size_shift == 0) { + kfree(tk); + return; } get_task_struct(tsk); tk->tsk = tsk; @@ -366,7 +370,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, * make sure the process doesn't catch the * signal and then access the memory. Just kill it. */ - if (fail || tk->addr_valid == 0) { + if (fail || tk->addr == -EFAULT) { pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); do_send_sig_info(SIGKILL, SEND_SIG_PRIV, @@ -1253,17 +1257,19 @@ int memory_failure(unsigned long pfn, int flags) if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); - if (!pfn_valid(pfn)) { + p = pfn_to_online_page(pfn); + if (!p) { + if (pfn_valid(pfn)) { + pgmap = get_dev_pagemap(pfn, NULL); + if (pgmap) + return memory_failure_dev_pagemap(pfn, flags, + pgmap); + } pr_err("Memory failure: %#lx: memory outside kernel control\n", pfn); return -ENXIO; } - pgmap = get_dev_pagemap(pfn, NULL); - if (pgmap) - return memory_failure_dev_pagemap(pfn, flags, pgmap); - - p = pfn_to_page(pfn); if (PageHuge(p)) return memory_failure_hugetlb(pfn, flags); if (TestSetPageHWPoison(p)) { diff --git a/mm/memory.c b/mm/memory.c index b1ca51a079f2..b6a5d6a08438 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -118,6 +118,18 @@ int randomize_va_space __read_mostly = 2; #endif +#ifndef arch_faults_on_old_pte +static inline bool arch_faults_on_old_pte(void) +{ + /* + * Those arches which don't have hw access flag feature need to + * implement their own helper. By default, "true" means pagefault + * will be hit on old pte. + */ + return true; +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -2145,32 +2157,82 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +static inline bool cow_user_page(struct page *dst, struct page *src, + struct vm_fault *vmf) { + bool ret; + void *kaddr; + void __user *uaddr; + bool force_mkyoung; + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = vmf->address; + debug_dma_assert_idle(src); + if (likely(src)) { + copy_user_highpage(dst, src, addr, vma); + return true; + } + /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ - if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst); - void __user *uaddr = (void __user *)(va & PAGE_MASK); + kaddr = kmap_atomic(dst); + uaddr = (void __user *)(addr & PAGE_MASK); + + /* + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. + */ + force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte); + if (force_mkyoung) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + /* + * Other thread has already handled the fault + * and we don't need to do anything. If it's + * not the case, the fault will be triggered + * again on the same address. + */ + ret = false; + goto pte_unlock; + } + entry = pte_mkyoung(vmf->orig_pte); + if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) + update_mmu_cache(vma, addr, vmf->pte); + } + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* - * This really shouldn't fail, because the page is there - * in the page tables. But it might just be unreadable, - * in which case we just give up and fill the result with - * zeroes. + * Give a warn in case there can be some obscure + * use-case */ - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - clear_page(kaddr); - kunmap_atomic(kaddr); - flush_dcache_page(dst); - } else - copy_user_highpage(dst, src, va, vma); + WARN_ON_ONCE(1); + clear_page(kaddr); + } + + ret = true; + +pte_unlock: + if (force_mkyoung) + pte_unmap_unlock(vmf->pte, vmf->ptl); + kunmap_atomic(kaddr); + flush_dcache_page(dst); + + return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) @@ -2327,7 +2389,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, vmf->address, vma); + + if (!cow_user_page(new_page, old_page, vmf)) { + /* + * COW failed, if the fault was solved by other, + * it's fine. If not, userspace would re-fault on + * the same address and we will handle the fault + * from the second attempt. + */ + put_page(new_page); + if (old_page) + put_page(old_page); + return 0; + } } if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b1be791f772d..f307bd82d750 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -331,7 +331,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long end_pfn) { for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(start_pfn))) + if (unlikely(!pfn_to_online_page(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -356,7 +356,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, /* pfn is the end pfn of a memory section. */ pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(pfn))) + if (unlikely(!pfn_to_online_page(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -415,7 +415,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, */ pfn = zone_start_pfn; for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(pfn))) + if (unlikely(!pfn_to_online_page(pfn))) continue; if (page_zone(pfn_to_page(pfn)) != zone) @@ -436,67 +436,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writeunlock(zone); } -static void shrink_pgdat_span(struct pglist_data *pgdat, - unsigned long start_pfn, unsigned long end_pfn) +static void update_pgdat_span(struct pglist_data *pgdat) { - unsigned long pgdat_start_pfn = pgdat->node_start_pfn; - unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ - unsigned long pgdat_end_pfn = p; - unsigned long pfn; - int nid = pgdat->node_id; - - if (pgdat_start_pfn == start_pfn) { - /* - * If the section is smallest section in the pgdat, it need - * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. - * In this case, we find second smallest valid mem_section - * for shrinking zone. - */ - pfn = find_smallest_section_pfn(nid, NULL, end_pfn, - pgdat_end_pfn); - if (pfn) { - pgdat->node_start_pfn = pfn; - pgdat->node_spanned_pages = pgdat_end_pfn - pfn; - } - } else if (pgdat_end_pfn == end_pfn) { - /* - * If the section is biggest section in the pgdat, it need - * shrink pgdat->node_spanned_pages. - * In this case, we find second biggest valid mem_section for - * shrinking zone. - */ - pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, - start_pfn); - if (pfn) - pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; - } + unsigned long node_start_pfn = 0, node_end_pfn = 0; + struct zone *zone; - /* - * If the section is not biggest or smallest mem_section in the pgdat, - * it only creates a hole in the pgdat. So in this case, we need not - * change the pgdat. - * But perhaps, the pgdat has only hole data. Thus it check the pgdat - * has only hole or not. - */ - pfn = pgdat_start_pfn; - for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_valid(pfn))) - continue; + for (zone = pgdat->node_zones; + zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { + unsigned long zone_end_pfn = zone->zone_start_pfn + + zone->spanned_pages; - if (pfn_to_nid(pfn) != nid) + /* No need to lock the zones, they can't change. */ + if (!zone->spanned_pages) continue; - - /* Skip range to be removed */ - if (pfn >= start_pfn && pfn < end_pfn) + if (!node_end_pfn) { + node_start_pfn = zone->zone_start_pfn; + node_end_pfn = zone_end_pfn; continue; + } - /* If we find valid section, we have nothing to do */ - return; + if (zone_end_pfn > node_end_pfn) + node_end_pfn = zone_end_pfn; + if (zone->zone_start_pfn < node_start_pfn) + node_start_pfn = zone->zone_start_pfn; } - /* The pgdat has no valid section */ - pgdat->node_start_pfn = 0; - pgdat->node_spanned_pages = 0; + pgdat->node_start_pfn = node_start_pfn; + pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } static void __remove_zone(struct zone *zone, unsigned long start_pfn, @@ -505,9 +471,19 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn, struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; +#ifdef CONFIG_ZONE_DEVICE + /* + * Zone shrinking code cannot properly deal with ZONE_DEVICE. So + * we will not try to shrink the zones - which is okay as + * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. + */ + if (zone_idx(zone) == ZONE_DEVICE) + return; +#endif + pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); - shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); + update_pgdat_span(pgdat); pgdat_resize_unlock(zone->zone_pgdat, &flags); } @@ -1680,6 +1656,18 @@ static int check_cpu_on_node(pg_data_t *pgdat) return 0; } +static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) +{ + int nid = *(int *)arg; + + /* + * If a memory block belongs to multiple nodes, the stored nid is not + * reliable. However, such blocks are always online (e.g., cannot get + * offlined) and, therefore, are still spanned by the node. + */ + return mem->nid == nid ? -EEXIST : 0; +} + /** * try_offline_node * @nid: the node ID @@ -1692,25 +1680,24 @@ static int check_cpu_on_node(pg_data_t *pgdat) void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - unsigned long start_pfn = pgdat->node_start_pfn; - unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - - if (!present_section_nr(section_nr)) - continue; + int rc; - if (pfn_to_nid(pfn) != nid) - continue; + /* + * If the node still spans pages (especially ZONE_DEVICE), don't + * offline it. A node spans memory after move_pfn_range_to_zone(), + * e.g., after the memory block was onlined. + */ + if (pgdat->node_spanned_pages) + return; - /* - * some memory sections of this node are not removed, and we - * can't offline node now. - */ + /* + * Especially offline memory blocks might not be spanned by the + * node. They will get spanned by the node once they get onlined. + * However, they link to the node in sysfs and can get onlined later. + */ + rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); + if (rc) return; - } if (check_cpu_on_node(pgdat)) return; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ae967bcf954..e08c94170ae4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -672,7 +672,9 @@ static const struct mm_walk_ops queue_pages_walk_ops = { * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 0 - queue pages successfully or no misplaced page. - * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified. + * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or + * memory range specified by nodemask and maxnode points outside + * your accessible address space (-EFAULT) */ static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, @@ -1286,7 +1288,7 @@ static long do_mbind(unsigned long start, unsigned long len, flags | MPOL_MF_INVERT, &pagelist); if (ret < 0) { - err = -EIO; + err = ret; goto up_out; } @@ -1305,10 +1307,12 @@ static long do_mbind(unsigned long start, unsigned long len, if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT))) err = -EIO; - } else - putback_movable_pages(&pagelist); - + } else { up_out: + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); + } + up_write(&mm->mmap_sem); mpol_out: mpol_put(new); diff --git a/mm/memremap.c b/mm/memremap.c index 32c79b51af86..03ccbdfeb697 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -13,8 +13,6 @@ #include <linux/xarray.h> static DEFINE_XARRAY(pgmap_array); -#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) -#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); @@ -105,6 +103,7 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) void memunmap_pages(struct dev_pagemap *pgmap) { struct resource *res = &pgmap->res; + struct page *first_page; unsigned long pfn; int nid; @@ -113,14 +112,16 @@ void memunmap_pages(struct dev_pagemap *pgmap) put_page(pfn_to_page(pfn)); dev_pagemap_cleanup(pgmap); + /* make sure to access a memmap that was actually initialized */ + first_page = pfn_to_page(pfn_first(pgmap)); + /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start))); + nid = page_to_nid(first_page); mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = PHYS_PFN(res->start); - __remove_pages(page_zone(pfn_to_page(pfn)), pfn, - PHYS_PFN(resource_size(res)), NULL); + __remove_pages(page_zone(first_page), PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), NULL); } else { arch_remove_memory(nid, res->start, resource_size(res), pgmap_altmap(pgmap)); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 7fde88695f35..f76ea05b1cb0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/interval_tree.h> #include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -28,6 +29,254 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { #endif /* + * The mmu notifier_mm structure is allocated and installed in + * mm->mmu_notifier_mm inside the mm_take_all_locks() protected + * critical section and it's released only when mm_count reaches zero + * in mmdrop(). + */ +struct mmu_notifier_mm { + /* all mmu notifiers registered in this mm are queued in this list */ + struct hlist_head list; + bool has_itree; + /* to serialize the list modifications and hlist_unhashed */ + spinlock_t lock; + unsigned long invalidate_seq; + unsigned long active_invalidate_ranges; + struct rb_root_cached itree; + wait_queue_head_t wq; + struct hlist_head deferred_list; +}; + +/* + * This is a collision-retry read-side/write-side 'lock', a lot like a + * seqcount, however this allows multiple write-sides to hold it at + * once. Conceptually the write side is protecting the values of the PTEs in + * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any + * writer exists. + * + * Note that the core mm creates nested invalidate_range_start()/end() regions + * within the same thread, and runs invalidate_range_start()/end() in parallel + * on multiple CPUs. This is designed to not reduce concurrency or block + * progress on the mm side. + * + * As a secondary function, holding the full write side also serves to prevent + * writers for the itree, this is an optimization to avoid extra locking + * during invalidate_range_start/end notifiers. + * + * The write side has two states, fully excluded: + * - mm->active_invalidate_ranges != 0 + * - mnn->invalidate_seq & 1 == True (odd) + * - some range on the mm_struct is being invalidated + * - the itree is not allowed to change + * + * And partially excluded: + * - mm->active_invalidate_ranges != 0 + * - mnn->invalidate_seq & 1 == False (even) + * - some range on the mm_struct is being invalidated + * - the itree is allowed to change + * + * Operations on mmu_notifier_mm->invalidate_seq (under spinlock): + * seq |= 1 # Begin writing + * seq++ # Release the writing state + * seq & 1 # True if a writer exists + * + * The later state avoids some expensive work on inv_end in the common case of + * no mni monitoring the VA. + */ +static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm) +{ + lockdep_assert_held(&mmn_mm->lock); + return mmn_mm->invalidate_seq & 1; +} + +static struct mmu_interval_notifier * +mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm, + const struct mmu_notifier_range *range, + unsigned long *seq) +{ + struct interval_tree_node *node; + struct mmu_interval_notifier *res = NULL; + + spin_lock(&mmn_mm->lock); + mmn_mm->active_invalidate_ranges++; + node = interval_tree_iter_first(&mmn_mm->itree, range->start, + range->end - 1); + if (node) { + mmn_mm->invalidate_seq |= 1; + res = container_of(node, struct mmu_interval_notifier, + interval_tree); + } + + *seq = mmn_mm->invalidate_seq; + spin_unlock(&mmn_mm->lock); + return res; +} + +static struct mmu_interval_notifier * +mn_itree_inv_next(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_next(&mni->interval_tree, range->start, + range->end - 1); + if (!node) + return NULL; + return container_of(node, struct mmu_interval_notifier, interval_tree); +} + +static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) +{ + struct mmu_interval_notifier *mni; + struct hlist_node *next; + + spin_lock(&mmn_mm->lock); + if (--mmn_mm->active_invalidate_ranges || + !mn_itree_is_invalidating(mmn_mm)) { + spin_unlock(&mmn_mm->lock); + return; + } + + /* Make invalidate_seq even */ + mmn_mm->invalidate_seq++; + + /* + * The inv_end incorporates a deferred mechanism like rtnl_unlock(). + * Adds and removes are queued until the final inv_end happens then + * they are progressed. This arrangement for tree updates is used to + * avoid using a blocking lock during invalidate_range_start. + */ + hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list, + deferred_item) { + if (RB_EMPTY_NODE(&mni->interval_tree.rb)) + interval_tree_insert(&mni->interval_tree, + &mmn_mm->itree); + else + interval_tree_remove(&mni->interval_tree, + &mmn_mm->itree); + hlist_del(&mni->deferred_item); + } + spin_unlock(&mmn_mm->lock); + + wake_up_all(&mmn_mm->wq); +} + +/** + * mmu_interval_read_begin - Begin a read side critical section against a VA + * range + * mni: The range to use + * + * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a + * collision-retry scheme similar to seqcount for the VA range under mni. If + * the mm invokes invalidation during the critical section then + * mmu_interval_read_retry() will return true. + * + * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs + * require a blocking context. The critical region formed by this can sleep, + * and the required 'user_lock' can also be a sleeping lock. + * + * The caller is required to provide a 'user_lock' to serialize both teardown + * and setup. + * + * The return value should be passed to mmu_interval_read_retry(). + */ +unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) +{ + struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm; + unsigned long seq; + bool is_invalidating; + + /* + * If the mni has a different seq value under the user_lock than we + * started with then it has collided. + * + * If the mni currently has the same seq value as the mmn_mm seq, then + * it is currently between invalidate_start/end and is colliding. + * + * The locking looks broadly like this: + * mn_tree_invalidate_start(): mmu_interval_read_begin(): + * spin_lock + * seq = READ_ONCE(mni->invalidate_seq); + * seq == mmn_mm->invalidate_seq + * spin_unlock + * spin_lock + * seq = ++mmn_mm->invalidate_seq + * spin_unlock + * op->invalidate_range(): + * user_lock + * mmu_interval_set_seq() + * mni->invalidate_seq = seq + * user_unlock + * + * [Required: mmu_interval_read_retry() == true] + * + * mn_itree_inv_end(): + * spin_lock + * seq = ++mmn_mm->invalidate_seq + * spin_unlock + * + * user_lock + * mmu_interval_read_retry(): + * mni->invalidate_seq != seq + * user_unlock + * + * Barriers are not needed here as any races here are closed by an + * eventual mmu_interval_read_retry(), which provides a barrier via the + * user_lock. + */ + spin_lock(&mmn_mm->lock); + /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ + seq = READ_ONCE(mni->invalidate_seq); + is_invalidating = seq == mmn_mm->invalidate_seq; + spin_unlock(&mmn_mm->lock); + + /* + * mni->invalidate_seq must always be set to an odd value via + * mmu_interval_set_seq() using the provided cur_seq from + * mn_itree_inv_start_range(). This ensures that if seq does wrap we + * will always clear the below sleep in some reasonable time as + * mmn_mm->invalidate_seq is even in the idle state. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (is_invalidating) + wait_event(mmn_mm->wq, + READ_ONCE(mmn_mm->invalidate_seq) != seq); + + /* + * Notice that mmu_interval_read_retry() can already be true at this + * point, avoiding loops here allows the caller to provide a global + * time bound. + */ + + return seq; +} +EXPORT_SYMBOL_GPL(mmu_interval_read_begin); + +static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, + struct mm_struct *mm) +{ + struct mmu_notifier_range range = { + .flags = MMU_NOTIFIER_RANGE_BLOCKABLE, + .event = MMU_NOTIFY_RELEASE, + .mm = mm, + .start = 0, + .end = ULONG_MAX, + }; + struct mmu_interval_notifier *mni; + unsigned long cur_seq; + bool ret; + + for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni; + mni = mn_itree_inv_next(mni, &range)) { + ret = mni->ops->invalidate(mni, &range, cur_seq); + WARN_ON(!ret); + } + + mn_itree_inv_end(mmn_mm); +} + +/* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap * runs with mm_users == 0. Other tasks may still invoke mmu notifiers @@ -39,7 +288,8 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ -void __mmu_notifier_release(struct mm_struct *mm) +static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, + struct mm_struct *mm) { struct mmu_notifier *mn; int id; @@ -49,7 +299,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all @@ -59,10 +309,9 @@ void __mmu_notifier_release(struct mm_struct *mm) if (mn->ops->release) mn->ops->release(mn, mm); - spin_lock(&mm->mmu_notifier_mm->lock); - while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { - mn = hlist_entry(mm->mmu_notifier_mm->list.first, - struct mmu_notifier, + spin_lock(&mmn_mm->lock); + while (unlikely(!hlist_empty(&mmn_mm->list))) { + mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier, hlist); /* * We arrived before mmu_notifier_unregister so @@ -72,7 +321,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ hlist_del_init_rcu(&mn->hlist); } - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mmn_mm->lock); srcu_read_unlock(&srcu, id); /* @@ -87,6 +336,17 @@ void __mmu_notifier_release(struct mm_struct *mm) synchronize_srcu(&srcu); } +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + + if (mmn_mm->has_itree) + mn_itree_release(mmn_mm, mm); + + if (!hlist_empty(&mmn_mm->list)) + mn_hlist_release(mmn_mm, mm); +} + /* * If no young bitflag is supported by the hardware, ->clear_flush_young can * unmap the address and return 1 or 0 depending if the mapping previously @@ -159,14 +419,43 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm, + const struct mmu_notifier_range *range) +{ + struct mmu_interval_notifier *mni; + unsigned long cur_seq; + + for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni; + mni = mn_itree_inv_next(mni, range)) { + bool ret; + + ret = mni->ops->invalidate(mni, range, cur_seq); + if (!ret) { + if (WARN_ON(mmu_notifier_range_blockable(range))) + continue; + goto out_would_block; + } + } + return 0; + +out_would_block: + /* + * On -EAGAIN the non-blocking caller is not allowed to call + * invalidate_range_end() + */ + mn_itree_inv_end(mmn_mm); + return -EAGAIN; +} + +static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, + struct mmu_notifier_range *range) { struct mmu_notifier *mn; int ret = 0; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { if (mn->ops->invalidate_range_start) { int _ret; @@ -180,7 +469,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) mn->ops->invalidate_range_start, _ret, !mmu_notifier_range_blockable(range) ? "non-" : ""); WARN_ON(mmu_notifier_range_blockable(range) || - ret != -EAGAIN); + _ret != -EAGAIN); ret = _ret; } } @@ -190,15 +479,30 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) return ret; } -void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, - bool only_end) +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +{ + struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + int ret; + + if (mmn_mm->has_itree) { + ret = mn_itree_invalidate(mmn_mm, range); + if (ret) + return ret; + } + if (!hlist_empty(&mmn_mm->list)) + return mn_hlist_invalidate_range_start(mmn_mm, range); + return 0; +} + +static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, + struct mmu_notifier_range *range, + bool only_end) { struct mmu_notifier *mn; int id; - lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -225,6 +529,19 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, } } srcu_read_unlock(&srcu, id); +} + +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, + bool only_end) +{ + struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + if (mmn_mm->has_itree) + mn_itree_inv_end(mmn_mm); + + if (!hlist_empty(&mmn_mm->list)) + mn_hlist_invalidate_end(mmn_mm, range, only_end); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } @@ -243,8 +560,9 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } /* - * Same as mmu_notifier_register but here the caller must hold the - * mmap_sem in write mode. + * Same as mmu_notifier_register but here the caller must hold the mmap_sem in + * write mode. A NULL mn signals the notifier is being registered for itree + * mode. */ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -261,9 +579,6 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) fs_reclaim_release(GFP_KERNEL); } - mn->mm = mm; - mn->users = 1; - if (!mm->mmu_notifier_mm) { /* * kmalloc cannot be called under mm_take_all_locks(), but we @@ -271,21 +586,22 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * the write side of the mmap_sem. */ mmu_notifier_mm = - kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (!mmu_notifier_mm) return -ENOMEM; INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mmu_notifier_mm->invalidate_seq = 2; + mmu_notifier_mm->itree = RB_ROOT_CACHED; + init_waitqueue_head(&mmu_notifier_mm->wq); + INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list); } ret = mm_take_all_locks(mm); if (unlikely(ret)) goto out_clean; - /* Pairs with the mmdrop in mmu_notifier_unregister_* */ - mmgrab(mm); - /* * Serialize the update against mmu_notifier_unregister. A * side note: mmu_notifier_release can't run concurrently with @@ -293,13 +609,28 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * current->mm or explicitly with get_task_mm() or similar). * We can't race against any other mmu notifier method either * thanks to mm_take_all_locks(). + * + * release semantics on the initialization of the mmu_notifier_mm's + * contents are provided for unlocked readers. acquire can only be + * used while holding the mmgrab or mmget, and is safe because once + * created the mmu_notififer_mm is not freed until the mm is + * destroyed. As above, users holding the mmap_sem or one of the + * mm_take_all_locks() do not need to use acquire semantics. */ if (mmu_notifier_mm) - mm->mmu_notifier_mm = mmu_notifier_mm; + smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm); - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); - spin_unlock(&mm->mmu_notifier_mm->lock); + if (mn) { + /* Pairs with the mmdrop in mmu_notifier_unregister_* */ + mmgrab(mm); + mn->mm = mm; + mn->users = 1; + + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + } else + mm->mmu_notifier_mm->has_itree = true; mm_drop_all_locks(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); @@ -516,6 +847,180 @@ out_unlock: } EXPORT_SYMBOL_GPL(mmu_notifier_put); +static int __mmu_interval_notifier_insert( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + struct mmu_notifier_mm *mmn_mm, unsigned long start, + unsigned long length, const struct mmu_interval_notifier_ops *ops) +{ + mni->mm = mm; + mni->ops = ops; + RB_CLEAR_NODE(&mni->interval_tree.rb); + mni->interval_tree.start = start; + /* + * Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval. + */ + if (length == 0 || + check_add_overflow(start, length - 1, &mni->interval_tree.last)) + return -EOVERFLOW; + + /* Must call with a mmget() held */ + if (WARN_ON(atomic_read(&mm->mm_count) <= 0)) + return -EINVAL; + + /* pairs with mmdrop in mmu_interval_notifier_remove() */ + mmgrab(mm); + + /* + * If some invalidate_range_start/end region is going on in parallel + * we don't know what VA ranges are affected, so we must assume this + * new range is included. + * + * If the itree is invalidating then we are not allowed to change + * it. Retrying until invalidation is done is tricky due to the + * possibility for live lock, instead defer the add to + * mn_itree_inv_end() so this algorithm is deterministic. + * + * In all cases the value for the mni->invalidate_seq should be + * odd, see mmu_interval_read_begin() + */ + spin_lock(&mmn_mm->lock); + if (mmn_mm->active_invalidate_ranges) { + if (mn_itree_is_invalidating(mmn_mm)) + hlist_add_head(&mni->deferred_item, + &mmn_mm->deferred_list); + else { + mmn_mm->invalidate_seq |= 1; + interval_tree_insert(&mni->interval_tree, + &mmn_mm->itree); + } + mni->invalidate_seq = mmn_mm->invalidate_seq; + } else { + WARN_ON(mn_itree_is_invalidating(mmn_mm)); + /* + * The starting seq for a mni not under invalidation should be + * odd, not equal to the current invalidate_seq and + * invalidate_seq should not 'wrap' to the new seq any time + * soon. + */ + mni->invalidate_seq = mmn_mm->invalidate_seq - 1; + interval_tree_insert(&mni->interval_tree, &mmn_mm->itree); + } + spin_unlock(&mmn_mm->lock); + return 0; +} + +/** + * mmu_interval_notifier_insert - Insert an interval notifier + * @mni: Interval notifier to register + * @start: Starting virtual address to monitor + * @length: Length of the range to monitor + * @mm : mm_struct to attach to + * + * This function subscribes the interval notifier for notifications from the + * mm. Upon return the ops related to mmu_interval_notifier will be called + * whenever an event that intersects with the given range occurs. + * + * Upon return the range_notifier may not be present in the interval tree yet. + * The caller must use the normal interval notifier read flow via + * mmu_interval_read_begin() to establish SPTEs for this range. + */ +int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_mm *mmn_mm; + int ret; + + might_lock(&mm->mmap_sem); + + mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm); + if (!mmn_mm || !mmn_mm->has_itree) { + ret = mmu_notifier_register(NULL, mm); + if (ret) + return ret; + mmn_mm = mm->mmu_notifier_mm; + } + return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, + ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert); + +int mmu_interval_notifier_insert_locked( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + unsigned long start, unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_mm *mmn_mm; + int ret; + + lockdep_assert_held_write(&mm->mmap_sem); + + mmn_mm = mm->mmu_notifier_mm; + if (!mmn_mm || !mmn_mm->has_itree) { + ret = __mmu_notifier_register(NULL, mm); + if (ret) + return ret; + mmn_mm = mm->mmu_notifier_mm; + } + return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, + ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); + +/** + * mmu_interval_notifier_remove - Remove a interval notifier + * @mni: Interval notifier to unregister + * + * This function must be paired with mmu_interval_notifier_insert(). It cannot + * be called from any ops callback. + * + * Once this returns ops callbacks are no longer running on other CPUs and + * will not be called in future. + */ +void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) +{ + struct mm_struct *mm = mni->mm; + struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + unsigned long seq = 0; + + might_sleep(); + + spin_lock(&mmn_mm->lock); + if (mn_itree_is_invalidating(mmn_mm)) { + /* + * remove is being called after insert put this on the + * deferred list, but before the deferred list was processed. + */ + if (RB_EMPTY_NODE(&mni->interval_tree.rb)) { + hlist_del(&mni->deferred_item); + } else { + hlist_add_head(&mni->deferred_item, + &mmn_mm->deferred_list); + seq = mmn_mm->invalidate_seq; + } + } else { + WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb)); + interval_tree_remove(&mni->interval_tree, &mmn_mm->itree); + } + spin_unlock(&mmn_mm->lock); + + /* + * The possible sleep on progress in the invalidation requires the + * caller not hold any locks held by invalidation callbacks. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (seq) + wait_event(mmn_mm->wq, + READ_ONCE(mmn_mm->invalidate_seq) != seq); + + /* pairs with mmgrab in mmu_interval_notifier_insert() */ + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove); + /** * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed * diff --git a/mm/nommu.c b/mm/nommu.c index 99b7ec318824..7de592058ab4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -155,11 +155,11 @@ void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) return __vmalloc(size, flags, PAGE_KERNEL); } -void *vmalloc_user(unsigned long size) +static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, flags, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; @@ -172,8 +172,19 @@ void *vmalloc_user(unsigned long size) return ret; } + +void *vmalloc_user(unsigned long size) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} EXPORT_SYMBOL(vmalloc_user); +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_user_flags(size, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15c2050c629b..f391c0c4ed1d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1175,11 +1175,17 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - arch_free_page(page, order); if (want_init_on_free()) kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order, 0); + /* + * arch_free_page() can make the page's contents inaccessible. s390 + * does this. So nothing which can access the page's contents should + * happen after this. + */ + arch_free_page(page, order); + if (debug_pagealloc_enabled()) kernel_map_pages(page, 1 << order, 0); @@ -1942,6 +1948,14 @@ void __init page_alloc_init_late(void) wait_for_completion(&pgdat_init_all_done_comp); /* + * The number of managed pages has changed due to the initialisation + * so the pcpu batch and high limits needs to be updated or the limits + * will be artificially small. + */ + for_each_populated_zone(zone) + zone_pcp_update(zone); + + /* * We initialized the rest of the deferred pages. Permanently disable * on-demand struct page initialization. */ @@ -3714,10 +3728,6 @@ try_this_zone: static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - - if (!__ratelimit(&show_mem_rs)) - return; /* * This documents exceptions given to allocations in certain @@ -3738,8 +3748,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) { struct va_format vaf; va_list args; - static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); + static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; @@ -4467,12 +4476,14 @@ retry_cpuset: if (page) goto got_pg; - if (order >= pageblock_order && (gfp_mask & __GFP_IO)) { + if (order >= pageblock_order && (gfp_mask & __GFP_IO) && + !(gfp_mask & __GFP_RETRY_MAYFAIL)) { /* * If allocating entire pageblock(s) and compaction * failed because all zones are below low watermarks * or is prohibited because it recently failed at this - * order, fail immediately. + * order, fail immediately unless the allocator has + * requested compaction and reclaim retry. * * Reclaim is * - potentially very expensive because zones are far @@ -8506,7 +8517,6 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) WARN(count != 0, "%d pages are still in use!\n", count); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. @@ -8520,7 +8530,6 @@ void __meminit zone_pcp_update(struct zone *zone) per_cpu_ptr(zone->pageset, cpu)); mutex_unlock(&pcp_batch_high_lock); } -#endif void zone_pcp_reset(struct zone *zone) { diff --git a/mm/page_ext.c b/mm/page_ext.c index 5f5769c7db3b..4ade843ff588 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -67,8 +67,9 @@ static struct page_ext_operations *page_ext_ops[] = { #endif }; +unsigned long page_ext_size = sizeof(struct page_ext); + static unsigned long total_usage; -static unsigned long extra_mem; static bool __init invoke_need_callbacks(void) { @@ -78,9 +79,8 @@ static bool __init invoke_need_callbacks(void) for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { - page_ext_ops[i]->offset = sizeof(struct page_ext) + - extra_mem; - extra_mem += page_ext_ops[i]->size; + page_ext_ops[i]->offset = page_ext_size; + page_ext_size += page_ext_ops[i]->size; need = true; } } @@ -99,14 +99,9 @@ static void __init invoke_init_callbacks(void) } } -static unsigned long get_entry_size(void) -{ - return sizeof(struct page_ext) + extra_mem; -} - static inline struct page_ext *get_entry(void *base, unsigned long index) { - return base + get_entry_size() * index; + return base + page_ext_size * index; } #if !defined(CONFIG_SPARSEMEM) @@ -156,7 +151,7 @@ static int __init alloc_node_page_ext(int nid) !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; - table_size = get_entry_size() * nr_pages; + table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -234,7 +229,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) if (section->page_ext) return 0; - table_size = get_entry_size() * PAGES_PER_SECTION; + table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* @@ -254,7 +249,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; - section->page_ext = (void *)base - get_entry_size() * pfn; + section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } @@ -267,7 +262,7 @@ static void free_page_ext(void *addr) struct page *page = virt_to_page(addr); size_t table_size; - table_size = get_entry_size() * PAGES_PER_SECTION; + table_size = page_ext_size * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); kmemleak_free(addr); diff --git a/mm/page_io.c b/mm/page_io.c index 24ee600f9131..60a66a58b9bf 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -73,6 +73,7 @@ static void swap_slot_free_notify(struct page *page) { struct swap_info_struct *sis; struct gendisk *disk; + swp_entry_t entry; /* * There is no guarantee that the page is in swap cache - the software @@ -104,11 +105,10 @@ static void swap_slot_free_notify(struct page *page) * we again wish to reclaim it. */ disk = sis->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) { - swp_entry_t entry; + entry.val = page_private(page); + if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) { unsigned long offset; - entry.val = page_private(page); offset = swp_offset(entry); SetPageDirty(page); diff --git a/mm/page_owner.c b/mm/page_owner.c index dee931184788..18ecde9f45b2 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -24,12 +24,10 @@ struct page_owner { short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; -#ifdef CONFIG_DEBUG_PAGEALLOC depot_stack_handle_t free_handle; -#endif }; -static bool page_owner_disabled = true; +static bool page_owner_enabled = false; DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; @@ -44,7 +42,7 @@ static int __init early_page_owner_param(char *buf) return -EINVAL; if (strcmp(buf, "on") == 0) - page_owner_disabled = false; + page_owner_enabled = true; return 0; } @@ -52,10 +50,7 @@ early_param("page_owner", early_page_owner_param); static bool need_page_owner(void) { - if (page_owner_disabled) - return false; - - return true; + return page_owner_enabled; } static __always_inline depot_stack_handle_t create_dummy_stack(void) @@ -84,7 +79,7 @@ static noinline void register_early_stack(void) static void init_page_owner(void) { - if (page_owner_disabled) + if (!page_owner_enabled) return; register_dummy_stack(); @@ -148,25 +143,19 @@ void __reset_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext; -#ifdef CONFIG_DEBUG_PAGEALLOC depot_stack_handle_t handle = 0; struct page_owner *page_owner; - if (debug_pagealloc_enabled()) - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); -#endif + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; for (i = 0; i < (1 << order); i++) { - page_ext = lookup_page_ext(page + i); - if (unlikely(!page_ext)) - continue; - __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { - page_owner = get_page_owner(page_ext); - page_owner->free_handle = handle; - } -#endif + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner = get_page_owner(page_ext); + page_owner->free_handle = handle; + page_ext = page_ext_next(page_ext); } } @@ -184,9 +173,9 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_ext = lookup_page_ext(page + i); + page_ext = page_ext_next(page_ext); } } @@ -224,12 +213,10 @@ void __split_page_owner(struct page *page, unsigned int order) if (unlikely(!page_ext)) return; - page_owner = get_page_owner(page_ext); - page_owner->order = 0; - for (i = 1; i < (1 << order); i++) { - page_ext = lookup_page_ext(page + i); + for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); page_owner->order = 0; + page_ext = page_ext_next(page_ext); } } @@ -260,7 +247,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) * the new page, which will be freed. */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); - __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -284,7 +271,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { - if (!pfn_valid(pfn)) { + page = pfn_to_online_page(pfn); + if (!page) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } @@ -292,13 +280,13 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - page = pfn_to_page(pfn); pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) continue; + /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); if (page_zone(page) != zone) @@ -320,7 +308,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (unlikely(!page_ext)) continue; - if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) continue; page_owner = get_page_owner(page_ext); @@ -435,7 +423,7 @@ void __dump_page_owner(struct page *page) return; } - if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) pr_alert("page_owner tracks the page as allocated\n"); else pr_alert("page_owner tracks the page as freed\n"); @@ -451,7 +439,6 @@ void __dump_page_owner(struct page *page) stack_trace_print(entries, nr_entries, 0); } -#ifdef CONFIG_DEBUG_PAGEALLOC handle = READ_ONCE(page_owner->free_handle); if (!handle) { pr_alert("page_owner free stack trace missing\n"); @@ -460,7 +447,6 @@ void __dump_page_owner(struct page *page) pr_alert("page last free stack trace:\n"); stack_trace_print(entries, nr_entries, 0); } -#endif if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", @@ -527,7 +513,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ - if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) continue; page_owner = get_page_owner(page_ext); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d48c2a986ea3..ea0b9e606ad1 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -10,8 +10,9 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; int err = 0; const struct mm_walk_ops *ops = walk->ops; + spinlock_t *ptl; - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (;;) { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) @@ -22,7 +23,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte++; } - pte_unmap(pte); + pte_unmap_unlock(pte, ptl); return err; } @@ -253,13 +254,23 @@ static int __walk_page_range(unsigned long start, unsigned long end, { int err = 0; struct vm_area_struct *vma = walk->vma; + const struct mm_walk_ops *ops = walk->ops; + + if (vma && ops->pre_vma) { + err = ops->pre_vma(start, end, walk); + if (err) + return err; + } if (vma && is_vm_hugetlb_page(vma)) { - if (walk->ops->hugetlb_entry) + if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); + if (vma && ops->post_vma) + ops->post_vma(walk); + return err; } @@ -290,6 +301,11 @@ static int __walk_page_range(unsigned long start, unsigned long end, * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * + * If operations need to be staged before and committed after a vma is walked, + * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), + * since it is intended to handle commit-type operations, can't return any + * errors. + * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some * caller-specific data to callbacks, @private should be helpful. @@ -376,3 +392,80 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, return err; return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } + +/** + * walk_page_mapping - walk all memory areas mapped into a struct address_space. + * @mapping: Pointer to the struct address_space + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * @ops: operation to call during the walk + * @private: private data for callbacks' usage + * + * This function walks all memory areas mapped into a struct address_space. + * The walk is limited to only the given page-size index range, but if + * the index boundaries cross a huge page-table entry, that entry will be + * included. + * + * Also see walk_page_range() for additional information. + * + * Locking: + * This function can't require that the struct mm_struct::mmap_sem is held, + * since @mapping may be mapped by multiple processes. Instead + * @mapping->i_mmap_rwsem must be held. This might have implications in the + * callbacks, and it's up tho the caller to ensure that the + * struct mm_struct::mmap_sem is not needed. + * + * Also this means that a caller can't rely on the struct + * vm_area_struct::vm_flags to be constant across a call, + * except for immutable flags. Callers requiring this shouldn't use + * this function. + * + * Return: 0 on success, negative error code on failure, positive number on + * caller defined premature termination. + */ +int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, + pgoff_t nr, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .private = private, + }; + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + int err = 0; + + lockdep_assert_held(&mapping->i_mmap_rwsem); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + walk.vma = vma; + walk.mm = vma->vm_mm; + + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); + if (err > 0) { + err = 0; + break; + } else if (err < 0) + break; + + err = __walk_page_range(start_addr, end_addr, &walk); + if (err) + break; + } + + return err; +} diff --git a/mm/rmap.c b/mm/rmap.c index d9a23bb773bf..0c7b2a9400d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -61,6 +61,7 @@ #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> +#include <linux/huge_mm.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> diff --git a/mm/shmem.c b/mm/shmem.c index cd570cc79c76..220be9fa2c41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3482,6 +3482,12 @@ static int shmem_parse_options(struct fs_context *fc, void *data) { char *options = data; + if (options) { + int err = security_sb_eat_lsm_opts(options, &fc->security); + if (err) + return err; + } + while (options != NULL) { char *this_char = options; for (;;) { diff --git a/mm/shuffle.c b/mm/shuffle.c index 3ce12481b1dc..b3fe97fd6654 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -33,7 +33,7 @@ __meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl) } static bool shuffle_param; -extern int shuffle_show(char *buffer, const struct kernel_param *kp) +static int shuffle_show(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state) ? 'Y' : 'N'); diff --git a/mm/slab.c b/mm/slab.c index 9df370558e5d..66e5d8032bae 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4206,9 +4206,12 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, /** * __ksize -- Uninstrumented ksize. + * @objp: pointer to the object * * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same * safety checks as ksize() with KASAN instrumentation enabled. + * + * Return: size of the actual memory used by @objp in bytes */ size_t __ksize(const void *objp) { diff --git a/mm/slab.h b/mm/slab.h index 68e455f2b698..b2b01694dc43 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -323,8 +323,8 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) * Expects a pointer to a slab page. Please note, that PageSlab() check * isn't sufficient, as it returns true also for tail compound slab pages, * which do not have slab_cache pointer set. - * So this function assumes that the page can pass PageHead() and PageSlab() - * checks. + * So this function assumes that the page can pass PageSlab() && !PageTail() + * check. * * The kmem_cache can be reparented asynchronously. The caller must ensure * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex. diff --git a/mm/slab_common.c b/mm/slab_common.c index 6491c3a41805..f9fb27b4c843 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -178,10 +178,13 @@ static int init_memcg_params(struct kmem_cache *s, static void destroy_memcg_params(struct kmem_cache *s) { - if (is_root_cache(s)) + if (is_root_cache(s)) { kvfree(rcu_access_pointer(s->memcg_params.memcg_caches)); - else + } else { + mem_cgroup_put(s->memcg_params.memcg); + WRITE_ONCE(s->memcg_params.memcg, NULL); percpu_ref_exit(&s->memcg_params.refcnt); + } } static void free_memcg_params(struct rcu_head *rcu) @@ -253,8 +256,6 @@ static void memcg_unlink_cache(struct kmem_cache *s) } else { list_del(&s->memcg_params.children_node); list_del(&s->memcg_params.kmem_caches_node); - mem_cgroup_put(s->memcg_params.memcg); - WRITE_ONCE(s->memcg_params.memcg, NULL); } } #else @@ -1030,10 +1031,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, unsigned int useroffset, unsigned int usersize) { int err; + unsigned int align = ARCH_KMALLOC_MINALIGN; s->name = name; s->size = s->object_size = size; - s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + /* + * For power of two sizes, guarantee natural alignment for kmalloc + * caches, regardless of SL*B debugging options. + */ + if (is_power_of_2(size)) + align = max(align, size); + s->align = calculate_alignment(flags, align, size); + s->useroffset = useroffset; s->usersize = usersize; @@ -1287,12 +1297,16 @@ void __init create_kmalloc_caches(slab_flags_t flags) */ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) { - void *ret; + void *ret = NULL; struct page *page; flags |= __GFP_COMP; page = alloc_pages(flags, order); - ret = page ? page_address(page) : NULL; + if (likely(page)) { + ret = page_address(page); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + 1 << order); + } ret = kasan_kmalloc_large(ret, size, flags); /* As ret might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ret, size, 1, flags); diff --git a/mm/slob.c b/mm/slob.c index cf377beab962..fa53e9f73893 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -190,7 +190,7 @@ static int slob_last(slob_t *s) static void *slob_new_pages(gfp_t gfp, int order, int node) { - void *page; + struct page *page; #ifdef CONFIG_NUMA if (node != NUMA_NO_NODE) @@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) if (!page) return NULL; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + 1 << order); return page_address(page); } static void slob_free_pages(void *b, int order) { + struct page *sp = virt_to_page(b); + if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; - free_pages((unsigned long)b, order); + + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE, + -(1 << order)); + __free_pages(sp, order); } /* @@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order) * @sp: Page to look in. * @size: Size of the allocation. * @align: Allocation alignment. + * @align_offset: Offset in the allocated block that will be aligned. * @page_removed_from_list: Return parameter. * * Tries to find a chunk of memory at least @size bytes big within @page. @@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order) * true (set to false otherwise). */ static void *slob_page_alloc(struct page *sp, size_t size, int align, - bool *page_removed_from_list) + int align_offset, bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); @@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); + /* + * 'aligned' will hold the address of the slob block so that the + * address 'aligned'+'align_offset' is aligned according to the + * 'align' parameter. This is for kmalloc() which prepends the + * allocated block with its size, so that the block itself is + * aligned when needed. + */ if (align) { - aligned = (slob_t *)ALIGN((unsigned long)cur, align); + aligned = (slob_t *) + (ALIGN((unsigned long)cur + align_offset, align) + - align_offset); delta = aligned - cur; } if (avail >= units + delta) { /* room enough? */ @@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, /* * slob_alloc: entry point into the slob allocator. */ -static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, + int align_offset) { struct page *sp; struct list_head *slob_list; @@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - b = slob_page_alloc(sp, size, align, &page_removed_from_list); + b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); if (!b) continue; @@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align, &_unused); + b = slob_page_alloc(sp, size, align, align_offset, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } @@ -451,7 +469,7 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) fs_reclaim_acquire(gfp); fs_reclaim_release(gfp); - if (size < PAGE_SIZE - align) { + if (size < PAGE_SIZE - minalign) { + int align = minalign; + + /* + * For power of two sizes, guarantee natural alignment for + * kmalloc()'d objects. + */ + if (is_power_of_2(size)) + align = max(minalign, (int) size); + if (!size) return ZERO_SIZE_PTR; - m = slob_alloc(size + align, gfp, align, node); + m = slob_alloc(size + minalign, gfp, align, node, minalign); if (!m) return NULL; *m = size; - ret = (void *)m + align; + ret = (void *)m + minalign; trace_kmalloc_node(caller, ret, - size, size + align, gfp, node); + size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -521,8 +548,13 @@ void kfree(const void *block) int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); - } else - __free_pages(sp, compound_order(sp)); + } else { + unsigned int order = compound_order(sp); + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE, + -(1 << order)); + __free_pages(sp, order); + + } } EXPORT_SYMBOL(kfree); @@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node); + b = slob_alloc(c->size, flags, c->align, node, 0); trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); diff --git a/mm/slub.c b/mm/slub.c index 42c1b3af3c98..e72e802fc569 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1433,12 +1433,15 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *old_tail = *tail ? *tail : *head; int rsize; - if (slab_want_init_on_free(s)) { - void *p = NULL; + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; - do { - object = next; - next = get_freepointer(s, object); + do { + object = next; + next = get_freepointer(s, object); + + if (slab_want_init_on_free(s)) { /* * Clear the object and the metadata, but don't touch * the redzone. @@ -1448,29 +1451,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, : 0; memset((char *)object + s->inuse, 0, s->size - s->inuse - rsize); - set_freepointer(s, object, p); - p = object; - } while (object != old_tail); - } -/* - * Compiler cannot detect this function can be removed if slab_free_hook() - * evaluates to nothing. Thus, catch all relevant config debug options here. - */ -#if defined(CONFIG_LOCKDEP) || \ - defined(CONFIG_DEBUG_KMEMLEAK) || \ - defined(CONFIG_DEBUG_OBJECTS_FREE) || \ - defined(CONFIG_KASAN) - - next = *head; - - /* Head and tail of the reconstructed freelist */ - *head = NULL; - *tail = NULL; - - do { - object = next; - next = get_freepointer(s, object); + } /* If object's reuse doesn't have to be delayed */ if (!slab_free_hook(s, object)) { /* Move object to the new freelist */ @@ -1485,9 +1467,6 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, *tail = NULL; return *head != NULL; -#else - return true; -#endif } static void *setup_object(struct kmem_cache *s, struct page *page, @@ -2672,6 +2651,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, } /* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)obj + s->offset), 0, sizeof(void *)); +} + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. @@ -2759,12 +2749,8 @@ redo: prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } - /* - * If the object has been wiped upon free, make sure it's fully - * initialized by zeroing out freelist pointer. - */ - if (unlikely(slab_want_init_on_free(s)) && object) - memset(object + s->offset, 0, sizeof(void *)); + + maybe_wipe_obj_freeptr(s, object); if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(object, 0, s->object_size); @@ -3178,10 +3164,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, goto error; c = this_cpu_ptr(s->cpu_slab); + maybe_wipe_obj_freeptr(s, p[i]); + continue; /* goto for-loop */ } c->freelist = get_freepointer(s, object); p[i] = object; + maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); local_irq_enable(); @@ -3821,11 +3810,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; void *ptr = NULL; + unsigned int order = get_order(size); flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, get_order(size)); - if (page) + page = alloc_pages_node(node, flags, order); + if (page) { ptr = page_address(page); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + 1 << order); + } return kmalloc_large_node_hook(ptr, size, flags); } @@ -3951,9 +3944,13 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { + unsigned int order = compound_order(page); + BUG_ON(!PageCompound(page)); kfree_hook(object); - __free_pages(page, compound_order(page)); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + -(1 << order)); + __free_pages(page, order); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); @@ -4838,7 +4835,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - get_online_mems(); + /* + * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" + * already held which will conflict with an existing lock order: + * + * mem_hotplug_lock->slab_mutex->kernfs_mutex + * + * We don't really need mem_hotplug_lock (to hold off + * slab_mem_going_offline_callback) here because slab's memory hot + * unplug code doesn't destroy the kmem_cache->node[] data. + */ + #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { struct kmem_cache_node *n; @@ -4879,7 +4886,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } diff --git a/mm/sparse.c b/mm/sparse.c index bf32de9e666b..f6891c1992b1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -219,7 +219,7 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } -void subsection_mask_set(unsigned long *map, unsigned long pfn, +static void subsection_mask_set(unsigned long *map, unsigned long pfn, unsigned long nr_pages) { int idx = subsection_map_index(pfn); diff --git a/mm/truncate.c b/mm/truncate.c index 8563339041f6..dd9ebc1da356 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -592,6 +592,16 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unlock_page(page); continue; } + + /* Take a pin outside pagevec */ + get_page(page); + + /* + * Drop extra pins before trying to invalidate + * the huge page. + */ + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); } ret = invalidate_inode_page(page); @@ -602,6 +612,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, */ if (!ret) deactivate_file_page(page); + if (PageTransHuge(page)) + put_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3c70e275f4e..4a7d7459c4f9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2672,6 +2672,26 @@ void *vzalloc_node(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node); /** + * vmalloc_user_node_flags - allocate memory for userspace on a specific node + * @size: allocation size + * @node: numa node + * @flags: flags for the page level allocator + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + flags | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, node, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_user_node_flags); + +/** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size * diff --git a/mm/vmpressure.c b/mm/vmpressure.c index f3b50811497a..4bac22fe1aa2 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -355,6 +355,9 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * "hierarchy" or "local"). * * To be used as memcg event method. + * + * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could + * not be parsed. */ int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) @@ -362,7 +365,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg, struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; - enum vmpressure_levels level = -1; + enum vmpressure_levels level; char *spec, *spec_orig; char *token; int ret = 0; @@ -375,20 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg, /* Find required level */ token = strsep(&spec, ","); - level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); - if (level < 0) { - ret = level; + ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (ret < 0) goto out; - } + level = ret; /* Find optional mode */ token = strsep(&spec, ","); if (token) { - mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); - if (mode < 0) { - ret = mode; + ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (ret < 0) goto out; - } + mode = ret; } ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -404,6 +405,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg, mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); + ret = 0; out: kfree(spec_orig); return ret; diff --git a/mm/vmscan.c b/mm/vmscan.c index e5d52d6a24af..ee4eecc7e1c2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -351,12 +351,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone) */ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { - unsigned long lru_size; + unsigned long lru_size = 0; int zid; - if (!mem_cgroup_disabled()) - lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); - else + if (!mem_cgroup_disabled()) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) + lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + } else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { @@ -932,10 +933,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under the i_pages lock, then this ordering is not required. */ - if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) - refcount = 1 + HPAGE_PMD_NR; - else - refcount = 2; + refcount = 1 + compound_nr(page); if (!page_ref_freeze(page, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ @@ -2459,17 +2457,70 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); + + if (protection) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2489,7 +2540,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2498,7 +2549,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2742,6 +2793,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 6afc892a148a..a8222041bd44 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1383,12 +1383,29 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, unsigned long freecount = 0; struct free_area *area; struct list_head *curr; + bool overflow = false; area = &(zone->free_area[order]); - list_for_each(curr, &area->free_list[mtype]) - freecount++; - seq_printf(m, "%6lu ", freecount); + list_for_each(curr, &area->free_list[mtype]) { + /* + * Cap the free_list iteration because it might + * be really large and we are under a spinlock + * so a long time spent here could trigger a + * hard lockup detector. Anyway this is a + * debugging tool so knowing there is a handful + * of pages of this order should be more than + * sufficient. + */ + if (++freecount >= 100000) { + overflow = true; + break; + } + } + seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); + spin_unlock_irq(&zone->lock); + cond_resched(); + spin_lock_irq(&zone->lock); } seq_putc(m, '\n'); } @@ -1972,7 +1989,7 @@ void __init init_mm_internals(void) #endif #ifdef CONFIG_PROC_FS proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); - proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op); + proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); proc_create_seq("vmstat", 0444, NULL, &vmstat_op); proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); #endif diff --git a/mm/z3fold.c b/mm/z3fold.c index 05bdf90646e7..6d3d3f698ebb 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) struct z3fold_header *zhdr; struct page *page; enum buddy bud; + bool page_claimed; zhdr = handle_to_z3fold_header(handle); page = virt_to_page(zhdr); + page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private); if (test_bit(PAGE_HEADLESS, &page->private)) { /* if a headless page is under reclaim, just leave. @@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) * has not been set before, we release this page * immediately so we don't care about its value any more. */ - if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) { + if (!page_claimed) { spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); @@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) atomic64_dec(&pool->pages_nr); return; } - if (test_bit(PAGE_CLAIMED, &page->private)) { + if (page_claimed) { + /* the page has not been claimed by us */ z3fold_page_unlock(zhdr); return; } if (unlikely(PageIsolated(page)) || test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { @@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) zhdr->cpu = -1; kref_get(&zhdr->refcount); do_compact_page(zhdr, true); + clear_bit(PAGE_CLAIMED, &page->private); return; } kref_get(&zhdr->refcount); queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } |