Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                    5
-rw-r--r--  mm/Makefile                   1
-rw-r--r--  mm/backing-dev.c              4
-rw-r--r--  mm/compaction.c               7
-rw-r--r--  mm/debug.c                   31
-rw-r--r--  mm/filemap.c                  1
-rw-r--r--  mm/gup.c                     14
-rw-r--r--  mm/hmm.c                    523
-rw-r--r--  mm/huge_memory.c              9
-rw-r--r--  mm/hugetlb.c                  5
-rw-r--r--  mm/hugetlb_cgroup.c           2
-rw-r--r--  mm/init-mm.c                  1
-rw-r--r--  mm/khugepaged.c              35
-rw-r--r--  mm/kmemleak.c                30
-rw-r--r--  mm/ksm.c                     14
-rw-r--r--  mm/maccess.c                 70
-rw-r--r--  mm/madvise.c                 16
-rw-r--r--  mm/mapping_dirty_helpers.c  315
-rw-r--r--  mm/memblock.c                 6
-rw-r--r--  mm/memcontrol.c              50
-rw-r--r--  mm/memory-failure.c          36
-rw-r--r--  mm/memory.c                 104
-rw-r--r--  mm/memory_hotplug.c         133
-rw-r--r--  mm/mempolicy.c               14
-rw-r--r--  mm/memremap.c                13
-rw-r--r--  mm/mmu_notifier.c           559
-rw-r--r--  mm/nommu.c                   15
-rw-r--r--  mm/page_alloc.c              31
-rw-r--r--  mm/page_ext.c                23
-rw-r--r--  mm/page_io.c                  6
-rw-r--r--  mm/page_owner.c              60
-rw-r--r--  mm/pagewalk.c                99
-rw-r--r--  mm/rmap.c                     1
-rw-r--r--  mm/shmem.c                    6
-rw-r--r--  mm/shuffle.c                  2
-rw-r--r--  mm/slab.c                     3
-rw-r--r--  mm/slab.h                     4
-rw-r--r--  mm/slab_common.c             28
-rw-r--r--  mm/slob.c                    62
-rw-r--r--  mm/slub.c                    88
-rw-r--r--  mm/sparse.c                   2
-rw-r--r--  mm/truncate.c                12
-rw-r--r--  mm/vmalloc.c                 20
-rw-r--r--  mm/vmpressure.c              20
-rw-r--r--  mm/vmscan.c                  86
-rw-r--r--  mm/vmstat.c                  25
-rw-r--r--  mm/z3fold.c                  10
47 files changed, 1740 insertions(+), 861 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a7eb51..f332efe751dd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -284,6 +284,7 @@ config VIRT_TO_BUS
config MMU_NOTIFIER
bool
select SRCU
+ select INTERVAL_TREE
config KSM
bool "Enable KSM for page merging"
@@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS
config HMM_MIRROR
bool
depends on MMU
- depends on MMU_NOTIFIER
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
@@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL
config ARCH_HAS_HUGEPD
bool
+config MAPPING_DIRTY_HELPERS
+ bool
+
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d996846697ef..1937cc251883 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -107,3 +107,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d9daa3e422d0..c360f6a6c844 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -239,8 +239,8 @@ static int __init default_bdi_init(void)
{
int err;
- bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
- WQ_UNBOUND | WQ_SYSFS, 0);
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
+ WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
diff --git a/mm/compaction.c b/mm/compaction.c
index ce08b39d85d4..672d3c78c6ab 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -270,14 +270,15 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
/* Ensure the start of the pageblock or zone is online and valid */
block_pfn = pageblock_start_pfn(pfn);
- block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn));
+ block_pfn = max(block_pfn, zone->zone_start_pfn);
+ block_page = pfn_to_online_page(block_pfn);
if (block_page) {
page = block_page;
pfn = block_pfn;
}
/* Ensure the end of the pageblock or zone is online and valid */
- block_pfn += pageblock_nr_pages;
+ block_pfn = pageblock_end_pfn(pfn) - 1;
block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
end_page = pfn_to_online_page(block_pfn);
if (!end_page)
@@ -303,7 +304,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
- } while (page < end_page);
+ } while (page <= end_page);
return false;
}
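
A worked example of the boundary fix, with invented numbers (assuming 4K pages and 2MB pageblocks, so pageblock_nr_pages == 512 == 0x200): once block_pfn has been clamped to an unaligned zone start, adding pageblock_nr_pages overshoots the pageblock, while deriving the end from pfn stays inside it, and the new "<=" comparison scans the last page as well.

	/* Illustrative values only. */
	unsigned long zone_start_pfn = 0x1100;	/* zone starts mid-pageblock */
	unsigned long pfn = 0x1180;		/* lies in pageblock 0x1000-0x11ff */
	unsigned long block_pfn;

	block_pfn = pageblock_start_pfn(pfn);		/* 0x1000 */
	block_pfn = max(block_pfn, zone_start_pfn);	/* clamped to 0x1100 */

	/*
	 * Old: block_pfn += pageblock_nr_pages yields 0x1300, past the last
	 * pfn of this pageblock (0x11ff) and into the middle of a later one.
	 * New: pageblock_end_pfn(pfn) - 1 yields 0x11ff, the true last pfn.
	 */
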
diff --git a/mm/debug.c b/mm/debug.c
index 8345bb6e4769..0461df1207cb 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -67,28 +67,31 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(page) ? 0 : page_mapcount(page);
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx",
- page, page_ref_count(page), mapcount,
- page->mapping, page_to_pgoff(page));
if (PageCompound(page))
- pr_cont(" compound_mapcount: %d", compound_mapcount(page));
- pr_cont("\n");
- if (PageAnon(page))
- pr_warn("anon ");
- else if (PageKsm(page))
- pr_warn("ksm ");
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px "
+ "index:%#lx compound_mapcount: %d\n",
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page),
+ compound_mapcount(page));
+ else
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n",
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page));
+ if (PageKsm(page))
+ pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ else if (PageAnon(page))
+ pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
else if (mapping) {
- pr_warn("%ps ", mapping->a_ops);
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
- pr_warn("name:\"%pd\" ", dentry);
- }
+ pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
+ } else
+ pr_warn("%ps\n", mapping->a_ops);
+ pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
-
hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1146fcfa3215..85b7d087eb45 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -40,6 +40,7 @@
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
+#include <linux/ramfs.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
diff --git a/mm/gup.c b/mm/gup.c
index 23a9f9c9d377..8f236a335ae9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1973,7 +1973,8 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
}
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
unsigned long pte_end;
struct page *head, *page;
@@ -1986,7 +1987,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
pte = READ_ONCE(*ptep);
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
return 0;
/* hugepages are never "special" */
@@ -2023,7 +2024,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
}
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
- unsigned int pdshift, unsigned long end, int write,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
pte_t *ptep;
@@ -2033,7 +2034,7 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
return 0;
} while (ptep++, addr = next, addr != end);
@@ -2041,7 +2042,7 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
}
#else
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
- unsigned pdshift, unsigned long end, int write,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
@@ -2049,7 +2050,8 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
#endif /* CONFIG_ARCH_HAS_HUGEPD */
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
diff --git a/mm/hmm.c b/mm/hmm.c
index 902f5fa6bf93..d379cb6496ae 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,193 +26,6 @@
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
-static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
-{
- struct hmm *hmm;
-
- hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
- if (!hmm)
- return ERR_PTR(-ENOMEM);
-
- init_waitqueue_head(&hmm->wq);
- INIT_LIST_HEAD(&hmm->mirrors);
- init_rwsem(&hmm->mirrors_sem);
- INIT_LIST_HEAD(&hmm->ranges);
- spin_lock_init(&hmm->ranges_lock);
- hmm->notifiers = 0;
- return &hmm->mmu_notifier;
-}
-
-static void hmm_free_notifier(struct mmu_notifier *mn)
-{
- struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
-
- WARN_ON(!list_empty(&hmm->ranges));
- WARN_ON(!list_empty(&hmm->mirrors));
- kfree(hmm);
-}
-
-static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
-{
- struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- struct hmm_mirror *mirror;
-
- /*
- * Since hmm_range_register() holds the mmget() lock hmm_release() is
- * prevented as long as a range exists.
- */
- WARN_ON(!list_empty_careful(&hmm->ranges));
-
- down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list) {
- /*
- * Note: The driver is not allowed to trigger
- * hmm_mirror_unregister() from this thread.
- */
- if (mirror->ops->release)
- mirror->ops->release(mirror);
- }
- up_read(&hmm->mirrors_sem);
-}
-
-static void notifiers_decrement(struct hmm *hmm)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&hmm->ranges_lock, flags);
- hmm->notifiers--;
- if (!hmm->notifiers) {
- struct hmm_range *range;
-
- list_for_each_entry(range, &hmm->ranges, list) {
- if (range->valid)
- continue;
- range->valid = true;
- }
- wake_up_all(&hmm->wq);
- }
- spin_unlock_irqrestore(&hmm->ranges_lock, flags);
-}
-
-static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *nrange)
-{
- struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- struct hmm_mirror *mirror;
- struct hmm_range *range;
- unsigned long flags;
- int ret = 0;
-
- spin_lock_irqsave(&hmm->ranges_lock, flags);
- hmm->notifiers++;
- list_for_each_entry(range, &hmm->ranges, list) {
- if (nrange->end < range->start || nrange->start >= range->end)
- continue;
-
- range->valid = false;
- }
- spin_unlock_irqrestore(&hmm->ranges_lock, flags);
-
- if (mmu_notifier_range_blockable(nrange))
- down_read(&hmm->mirrors_sem);
- else if (!down_read_trylock(&hmm->mirrors_sem)) {
- ret = -EAGAIN;
- goto out;
- }
-
- list_for_each_entry(mirror, &hmm->mirrors, list) {
- int rc;
-
- rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
- if (rc) {
- if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
- rc != -EAGAIN))
- continue;
- ret = -EAGAIN;
- break;
- }
- }
- up_read(&hmm->mirrors_sem);
-
-out:
- if (ret)
- notifiers_decrement(hmm);
- return ret;
-}
-
-static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- const struct mmu_notifier_range *nrange)
-{
- struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
-
- notifiers_decrement(hmm);
-}
-
-static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
- .release = hmm_release,
- .invalidate_range_start = hmm_invalidate_range_start,
- .invalidate_range_end = hmm_invalidate_range_end,
- .alloc_notifier = hmm_alloc_notifier,
- .free_notifier = hmm_free_notifier,
-};
-
-/*
- * hmm_mirror_register() - register a mirror against an mm
- *
- * @mirror: new mirror struct to register
- * @mm: mm to register against
- * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
- *
- * To start mirroring a process address space, the device driver must register
- * an HMM mirror struct.
- *
- * The caller cannot unregister the hmm_mirror while any ranges are
- * registered.
- *
- * Callers using this function must put a call to mmu_notifier_synchronize()
- * in their module exit functions.
- */
-int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
-{
- struct mmu_notifier *mn;
-
- lockdep_assert_held_write(&mm->mmap_sem);
-
- /* Sanity check */
- if (!mm || !mirror || !mirror->ops)
- return -EINVAL;
-
- mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
- if (IS_ERR(mn))
- return PTR_ERR(mn);
- mirror->hmm = container_of(mn, struct hmm, mmu_notifier);
-
- down_write(&mirror->hmm->mirrors_sem);
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
-
- return 0;
-}
-EXPORT_SYMBOL(hmm_mirror_register);
-
-/*
- * hmm_mirror_unregister() - unregister a mirror
- *
- * @mirror: mirror struct to unregister
- *
- * Stop mirroring a process address space, and cleanup.
- */
-void hmm_mirror_unregister(struct hmm_mirror *mirror)
-{
- struct hmm *hmm = mirror->hmm;
-
- down_write(&hmm->mirrors_sem);
- list_del(&mirror->list);
- up_write(&hmm->mirrors_sem);
- mmu_notifier_put(&hmm->mmu_notifier);
-}
-EXPORT_SYMBOL(hmm_mirror_unregister);
-
struct hmm_vma_walk {
struct hmm_range *range;
struct dev_pagemap *pgmap;
@@ -252,18 +65,15 @@ err:
return -EFAULT;
}
-static int hmm_pfns_bad(unsigned long addr,
- unsigned long end,
- struct mm_walk *walk)
+static int hmm_pfns_fill(unsigned long addr, unsigned long end,
+ struct hmm_range *range, enum hmm_pfn_value_e value)
{
- struct hmm_vma_walk *hmm_vma_walk = walk->private;
- struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
unsigned long i;
i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
- pfns[i] = range->values[HMM_PFN_ERROR];
+ pfns[i] = range->values[value];
return 0;
}
@@ -532,8 +342,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (unlikely(!hmm_vma_walk->pgmap))
return -EBUSY;
} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
- *pfn = range->values[HMM_PFN_SPECIAL];
- return -EFAULT;
+ if (!is_zero_pfn(pte_pfn(pte))) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+ /*
+ * Since each architecture defines a struct page for the zero
+ * page, just fall through and treat it like a normal page.
+ */
}
*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
@@ -584,7 +400,7 @@ again:
}
return 0;
} else if (!pmd_present(pmd))
- return hmm_pfns_bad(start, end, walk);
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
@@ -612,7 +428,7 @@ again:
* recover.
*/
if (pmd_bad(pmd))
- return hmm_pfns_bad(start, end, walk);
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
ptep = pte_offset_map(pmdp, addr);
i = (addr - range->start) >> PAGE_SHIFT;
@@ -770,93 +586,55 @@ unlock:
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
-static void hmm_pfns_clear(struct hmm_range *range,
- uint64_t *pfns,
- unsigned long addr,
- unsigned long end)
-{
- for (; addr < end; addr += PAGE_SIZE, pfns++)
- *pfns = range->values[HMM_PFN_NONE];
-}
-
-/*
- * hmm_range_register() - start tracking change to CPU page table over a range
- * @range: range
- * @mm: the mm struct for the range of virtual address
- *
- * Return: 0 on success, -EFAULT if the address space is no longer valid
- *
- * Track updates to the CPU page table see include/linux/hmm.h
- */
-int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
+static int hmm_vma_walk_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
{
- struct hmm *hmm = mirror->hmm;
- unsigned long flags;
-
- range->valid = false;
- range->hmm = NULL;
-
- if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
- return -EINVAL;
- if (range->start >= range->end)
- return -EINVAL;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
- /* Prevent hmm_release() from running while the range is valid */
- if (!mmget_not_zero(hmm->mmu_notifier.mm))
+ /*
+ * Skip vma ranges that don't have struct page backing them or
+ * map I/O devices directly.
+ */
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
return -EFAULT;
- /* Initialize range to track CPU page table updates. */
- spin_lock_irqsave(&hmm->ranges_lock, flags);
-
- range->hmm = hmm;
- list_add(&range->list, &hmm->ranges);
-
/*
- * If there are any concurrent notifiers we have to wait for them for
- * the range to be valid (see hmm_range_wait_until_valid()).
+ * If the vma does not allow read access, then assume that it does not
+ * allow write access either. HMM does not support architectures
+ * that allow write without read.
*/
- if (!hmm->notifiers)
- range->valid = true;
- spin_unlock_irqrestore(&hmm->ranges_lock, flags);
-
- return 0;
-}
-EXPORT_SYMBOL(hmm_range_register);
+ if (!(vma->vm_flags & VM_READ)) {
+ bool fault, write_fault;
-/*
- * hmm_range_unregister() - stop tracking change to CPU page table over a range
- * @range: range
- *
- * Range struct is used to track updates to the CPU page table after a call to
- * hmm_range_register(). See include/linux/hmm.h for how to use it.
- */
-void hmm_range_unregister(struct hmm_range *range)
-{
- struct hmm *hmm = range->hmm;
- unsigned long flags;
+ /*
+ * Check to see if a fault is requested for any page in the
+ * range.
+ */
+ hmm_range_need_fault(hmm_vma_walk, range->pfns +
+ ((start - range->start) >> PAGE_SHIFT),
+ (end - start) >> PAGE_SHIFT,
+ 0, &fault, &write_fault);
+ if (fault || write_fault)
+ return -EFAULT;
- spin_lock_irqsave(&hmm->ranges_lock, flags);
- list_del_init(&range->list);
- spin_unlock_irqrestore(&hmm->ranges_lock, flags);
+ hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+ hmm_vma_walk->last = end;
- /* Drop reference taken by hmm_range_register() */
- mmput(hmm->mmu_notifier.mm);
+ /* Skip this vma and continue processing the next vma. */
+ return 1;
+ }
- /*
- * The range is now invalid and the ref on the hmm is dropped, so
- * poison the pointer. Leave other fields in place, for the caller's
- * use.
- */
- range->valid = false;
- memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
+ return 0;
}
-EXPORT_SYMBOL(hmm_range_unregister);
static const struct mm_walk_ops hmm_walk_ops = {
.pud_entry = hmm_vma_walk_pud,
.pmd_entry = hmm_vma_walk_pmd,
.pte_hole = hmm_vma_walk_hole,
.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
+ .test_walk = hmm_vma_walk_test,
};
/**
@@ -889,210 +667,27 @@ static const struct mm_walk_ops hmm_walk_ops = {
*/
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
- const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
- unsigned long start = range->start, end;
- struct hmm_vma_walk hmm_vma_walk;
- struct hmm *hmm = range->hmm;
- struct vm_area_struct *vma;
+ struct hmm_vma_walk hmm_vma_walk = {
+ .range = range,
+ .last = range->start,
+ .flags = flags,
+ };
+ struct mm_struct *mm = range->notifier->mm;
int ret;
- lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
+ lockdep_assert_held(&mm->mmap_sem);
do {
/* If range is no longer valid force retry. */
- if (!range->valid)
+ if (mmu_interval_check_retry(range->notifier,
+ range->notifier_seq))
return -EBUSY;
+ ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
+ &hmm_walk_ops, &hmm_vma_walk);
+ } while (ret == -EBUSY);
- vma = find_vma(hmm->mmu_notifier.mm, start);
- if (vma == NULL || (vma->vm_flags & device_vma))
- return -EFAULT;
-
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it
- * does not allow write access, either. HMM does not
- * support architecture that allow write without read.
- */
- hmm_pfns_clear(range, range->pfns,
- range->start, range->end);
- return -EPERM;
- }
-
- hmm_vma_walk.pgmap = NULL;
- hmm_vma_walk.last = start;
- hmm_vma_walk.flags = flags;
- hmm_vma_walk.range = range;
- end = min(range->end, vma->vm_end);
-
- walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
- &hmm_vma_walk);
-
- do {
- ret = walk_page_range(vma->vm_mm, start, end,
- &hmm_walk_ops, &hmm_vma_walk);
- start = hmm_vma_walk.last;
-
- /* Keep trying while the range is valid. */
- } while (ret == -EBUSY && range->valid);
-
- if (ret) {
- unsigned long i;
-
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(range, &range->pfns[i],
- hmm_vma_walk.last, range->end);
- return ret;
- }
- start = end;
-
- } while (start < range->end);
-
+ if (ret)
+ return ret;
return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
-
-/**
- * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
- * @range: range being faulted
- * @device: device to map page to
- * @daddrs: array of dma addresses for the mapped pages
- * @flags: HMM_FAULT_*
- *
- * Return: the number of pages mapped on success (including zero), or any
- * status return from hmm_range_fault() otherwise.
- */
-long hmm_range_dma_map(struct hmm_range *range, struct device *device,
- dma_addr_t *daddrs, unsigned int flags)
-{
- unsigned long i, npages, mapped;
- long ret;
-
- ret = hmm_range_fault(range, flags);
- if (ret <= 0)
- return ret ? ret : -EBUSY;
-
- npages = (range->end - range->start) >> PAGE_SHIFT;
- for (i = 0, mapped = 0; i < npages; ++i) {
- enum dma_data_direction dir = DMA_TO_DEVICE;
- struct page *page;
-
- /*
- * FIXME need to update DMA API to provide invalid DMA address
- * value instead of a function to test dma address value. This
- * would remove lot of dumb code duplicated accross many arch.
- *
- * For now setting it to 0 here is good enough as the pfns[]
- * value is what is use to check what is valid and what isn't.
- */
- daddrs[i] = 0;
-
- page = hmm_device_entry_to_page(range, range->pfns[i]);
- if (page == NULL)
- continue;
-
- /* Check if range is being invalidated */
- if (!range->valid) {
- ret = -EBUSY;
- goto unmap;
- }
-
- /* If it is read and write than map bi-directional. */
- if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
- dir = DMA_BIDIRECTIONAL;
-
- daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
- if (dma_mapping_error(device, daddrs[i])) {
- ret = -EFAULT;
- goto unmap;
- }
-
- mapped++;
- }
-
- return mapped;
-
-unmap:
- for (npages = i, i = 0; (i < npages) && mapped; ++i) {
- enum dma_data_direction dir = DMA_TO_DEVICE;
- struct page *page;
-
- page = hmm_device_entry_to_page(range, range->pfns[i]);
- if (page == NULL)
- continue;
-
- if (dma_mapping_error(device, daddrs[i]))
- continue;
-
- /* If it is read and write than map bi-directional. */
- if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
- dir = DMA_BIDIRECTIONAL;
-
- dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
- mapped--;
- }
-
- return ret;
-}
-EXPORT_SYMBOL(hmm_range_dma_map);
-
-/**
- * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
- * @range: range being unmapped
- * @device: device against which dma map was done
- * @daddrs: dma address of mapped pages
- * @dirty: dirty page if it had the write flag set
- * Return: number of page unmapped on success, -EINVAL otherwise
- *
- * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
- * to the sync_cpu_device_pagetables() callback so that it is safe here to
- * call set_page_dirty(). Caller must also take appropriate locks to avoid
- * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
- */
-long hmm_range_dma_unmap(struct hmm_range *range,
- struct device *device,
- dma_addr_t *daddrs,
- bool dirty)
-{
- unsigned long i, npages;
- long cpages = 0;
-
- /* Sanity check. */
- if (range->end <= range->start)
- return -EINVAL;
- if (!daddrs)
- return -EINVAL;
- if (!range->pfns)
- return -EINVAL;
-
- npages = (range->end - range->start) >> PAGE_SHIFT;
- for (i = 0; i < npages; ++i) {
- enum dma_data_direction dir = DMA_TO_DEVICE;
- struct page *page;
-
- page = hmm_device_entry_to_page(range, range->pfns[i]);
- if (page == NULL)
- continue;
-
- /* If it is read and write than map bi-directional. */
- if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
- dir = DMA_BIDIRECTIONAL;
-
- /*
- * See comments in function description on why it is
- * safe here to call set_page_dirty()
- */
- if (dirty)
- set_page_dirty(page);
- }
-
- /* Unmap and clear pfns/dma address */
- dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
- range->pfns[i] = range->values[HMM_PFN_NONE];
- /* FIXME see comments in hmm_vma_dma_map() */
- daddrs[i] = 0;
- cpages++;
- }
-
- return cpages;
-}
-EXPORT_SYMBOL(hmm_range_dma_unmap);
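
With hmm_mirror and hmm_range_register() removed, a driver now pairs hmm_range_fault() with an mmu interval notifier and the sequence-count retry visible above. A minimal sketch of the resulting calling convention follows; driver_lock and driver_update_device_ptes() are hypothetical stand-ins for whatever serializes against the driver's invalidate callback.

	static long driver_populate_range(struct hmm_range *range,
					  struct mutex *driver_lock)
	{
		struct mm_struct *mm = range->notifier->mm;
		long ret;

	again:
		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		down_read(&mm->mmap_sem);
		ret = hmm_range_fault(range, 0);
		up_read(&mm->mmap_sem);
		if (ret == -EBUSY)
			goto again;	/* range was invalidated mid-walk */
		if (ret < 0)
			return ret;

		mutex_lock(driver_lock);
		/*
		 * Re-check the sequence under the same lock the driver's
		 * invalidate callback takes before touching device ptes.
		 */
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(driver_lock);
			goto again;
		}
		driver_update_device_ptes(range);	/* hypothetical */
		mutex_unlock(driver_lock);
		return ret;
	}
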
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c5cb6dcd6c69..13cc93785006 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2789,8 +2789,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
- if (mapping)
- __dec_node_page_state(page, NR_SHMEM_THPS);
+ if (mapping) {
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_THPS);
+ else
+ __dec_node_page_state(page, NR_FILE_THPS);
+ }
+
spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ef37c85423a5..b45a95363a84 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1084,11 +1084,10 @@ static bool pfn_range_valid_gigantic(struct zone *z,
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
+ page = pfn_to_online_page(i);
+ if (!page)
return false;
- page = pfn_to_page(i);
-
if (page_zone(page) != z)
return false;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index f1930fa0b445..2ac38bdc18a1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fb1e15028ef0..19603302a77f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,6 +5,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/cpumask.h>
+#include <linux/mman.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0a1b4b484ac5..a8a57bebb5fa 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1028,12 +1028,13 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
+
+ pte = pte_offset_map(pmd, address);
+ pte_ptl = pte_lockptr(mm, pmd);
+
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
@@ -1601,17 +1602,6 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_unlocked;
}
- } else if (!PageUptodate(page)) {
- xas_unlock_irq(&xas);
- wait_on_page_locked(page);
- if (!trylock_page(page)) {
- result = SCAN_PAGE_LOCK;
- goto xa_unlocked;
- }
- get_page(page);
- } else if (PageDirty(page)) {
- result = SCAN_FAIL;
- goto xa_locked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1626,7 +1616,12 @@ static void collapse_file(struct mm_struct *mm,
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+
+ /* make sure the page is up to date */
+ if (unlikely(!PageUptodate(page))) {
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
/*
* If file was truncated then extended, or hole-punched, before
@@ -1642,6 +1637,16 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}
+ if (!is_shmem && PageDirty(page)) {
+ /*
+ * khugepaged only works on read-only fd, so this
+ * page is dirty because it hasn't been flushed
+ * since first write.
+ */
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
+
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 03a8d84badad..244607663363 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -527,6 +527,16 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
}
/*
+ * Remove an object from the object_tree_root and object_list. Must be called
+ * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ */
+static void __remove_object(struct kmemleak_object *object)
+{
+ rb_erase(&object->rb_node, &object_tree_root);
+ list_del_rcu(&object->object_list);
+}
+
+/*
* Look up an object in the object search tree and remove it from both
* object_tree_root and object_list. The returned object's use_count should be
* at least 1, as initially set by create_object().
@@ -538,10 +548,8 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
write_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- if (object) {
- rb_erase(&object->rb_node, &object_tree_root);
- list_del_rcu(&object->object_list);
- }
+ if (object)
+ __remove_object(object);
write_unlock_irqrestore(&kmemleak_lock, flags);
return object;
@@ -1834,12 +1842,16 @@ static const struct file_operations kmemleak_fops = {
static void __kmemleak_do_cleanup(void)
{
- struct kmemleak_object *object;
+ struct kmemleak_object *object, *tmp;
- rcu_read_lock();
- list_for_each_entry_rcu(object, &object_list, object_list)
- delete_object_full(object->pointer);
- rcu_read_unlock();
+ /*
+ * Kmemleak has already been disabled, no need for RCU list traversal
+ * or kmemleak_lock held.
+ */
+ list_for_each_entry_safe(object, tmp, &object_list, object_list) {
+ __remove_object(object);
+ __delete_object(object);
+ }
}
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index dbee2eb4dd05..7905934cd3ad 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -885,13 +885,13 @@ static int remove_stable_node(struct stable_node *stable_node)
return 0;
}
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
+ /*
+ * The page could still be mapped if this races with __mmput()
+ * running between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
diff --git a/mm/maccess.c b/mm/maccess.c
index d065736f6b87..3ca8d97e5010 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -18,6 +18,18 @@ probe_read_common(void *dst, const void __user *src, size_t size)
return ret ? -EFAULT : 0;
}
+static __always_inline long
+probe_write_common(void __user *dst, const void *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
/**
* probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
@@ -31,11 +43,20 @@ probe_read_common(void *dst, const void __user *src, size_t size)
* do_page_fault() doesn't attempt to take mmap_sem. This makes
* probe_kernel_read() suitable for use within regions where the caller
* already holds mmap_sem, or other locks which nest inside mmap_sem.
+ *
+ * probe_kernel_read_strict() is the same as probe_kernel_read() except for
+ * the case where architectures have non-overlapping user and kernel address
+ * ranges: probe_kernel_read_strict() will additionally return -EFAULT for
+ * probing memory on a user address range where probe_user_read() is supposed
+ * to be used instead.
*/
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
+long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
long __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
@@ -85,6 +106,7 @@ EXPORT_SYMBOL_GPL(probe_user_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
+
long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
@@ -94,15 +116,39 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
+ ret = probe_write_common((__force void __user *)dst, src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
+/**
+ * probe_user_write(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_write(void __user *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_user_write")));
+
+long __probe_user_write(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(dst, size))
+ ret = probe_write_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_write);
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
@@ -120,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_kernel_write);
*
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
+ *
+ * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
+ * for the case where architectures have non-overlapping user and kernel address
+ * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
+ * probing memory on a user address range where strncpy_from_unsafe_user() is
+ * supposed to be used instead.
*/
-long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+
+long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+ __attribute__((alias("__strncpy_from_unsafe")));
+
+long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
+ long count)
+ __attribute__((alias("__strncpy_from_unsafe")));
+
+long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
{
mm_segment_t old_fs = get_fs();
const void *src = unsafe_addr;
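
A usage note, not part of the patch: probe_user_write() is the store-side twin of probe_user_read(), for contexts that must not fault, e.g. a tracing helper patching a word in the traced task's user memory. A hypothetical helper built on the new export:

	static long poke_user_u64(u64 __user *uaddr, u64 val)
	{
		/*
		 * Returns 0 on success, -EFAULT if the page is absent or
		 * cannot be written without taking a fault.
		 */
		return probe_user_write(uaddr, &val, sizeof(val));
	}
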
diff --git a/mm/madvise.c b/mm/madvise.c
index 2be9f3fdb05e..94c343b4c968 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -363,8 +363,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
ClearPageReferenced(page);
test_and_clear_page_young(page);
if (pageout) {
- if (!isolate_lru_page(page))
- list_add(&page->lru, &page_list);
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
} else
deactivate_page(page);
huge_unlock:
@@ -441,8 +445,12 @@ regular_page:
ClearPageReferenced(page);
test_and_clear_page_young(page);
if (pageout) {
- if (!isolate_lru_page(page))
- list_add(&page->lru, &page_list);
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
} else
deactivate_page(page);
}
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
new file mode 100644
index 000000000000..71070dda9643
--- /dev/null
+++ b/mm/mapping_dirty_helpers.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/hugetlb.h>
+#include <linux/bitops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct wp_walk - Private struct for pagetable walk callbacks
+ * @range: Range for mmu notifiers
+ * @tlbflush_start: Address of first modified pte
+ * @tlbflush_end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ */
+struct wp_walk {
+ struct mmu_notifier_range range;
+ unsigned long tlbflush_start;
+ unsigned long tlbflush_end;
+ unsigned long total;
+};
+
+/**
+ * wp_pte - Write-protect a pte
+ * @pte: Pointer to the pte
+ * @addr: The virtual page address
+ * @walk: pagetable walk callback argument
+ *
+ * The function write-protects a pte and records the range in
+ * virtual address space of touched ptes for efficient range TLB flushes.
+ */
+static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+ pte_t ptent = *pte;
+
+ if (pte_write(ptent)) {
+ pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
+
+ ptent = pte_wrprotect(old_pte);
+ ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
+ wpwalk->total++;
+ wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
+ wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
+ addr + PAGE_SIZE);
+ }
+
+ return 0;
+}
+
+/**
+ * struct clean_walk - Private struct for the clean_record_pte function.
+ * @base: struct wp_walk we derive from
+ * @bitmap_pgoff: Address_space page offset of the first bit in @bitmap
+ * @bitmap: Bitmap with one bit for each page offset in the address_space range
+ * covered.
+ * @start: Address_space page offset of first modified pte relative
+ * to @bitmap_pgoff
+ * @end: Address_space page offset of last modified pte relative
+ * to @bitmap_pgoff
+ */
+struct clean_walk {
+ struct wp_walk base;
+ pgoff_t bitmap_pgoff;
+ unsigned long *bitmap;
+ pgoff_t start;
+ pgoff_t end;
+};
+
+#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)
+
+/**
+ * clean_record_pte - Clean a pte and record its address space offset in a
+ * bitmap
+ * @pte: Pointer to the pte
+ * @addr: The virtual page address
+ * @walk: pagetable walk callback argument
+ *
+ * The function cleans a pte and records the range in
+ * virtual address space of touched ptes for efficient TLB flushes.
+ * It also records dirty ptes in a bitmap representing page offsets
+ * in the address_space, as well as the first and last of the bits
+ * touched.
+ */
+static int clean_record_pte(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+ struct clean_walk *cwalk = to_clean_walk(wpwalk);
+ pte_t ptent = *pte;
+
+ if (pte_dirty(ptent)) {
+ pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
+ walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
+ pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
+
+ ptent = pte_mkclean(old_pte);
+ ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
+
+ wpwalk->total++;
+ wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
+ wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
+ addr + PAGE_SIZE);
+
+ __set_bit(pgoff, cwalk->bitmap);
+ cwalk->start = min(cwalk->start, pgoff);
+ cwalk->end = max(cwalk->end, pgoff + 1);
+ }
+
+ return 0;
+}
+
+/* wp_clean_pmd_entry - The pagewalk pmd callback. */
+static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ /* Dirty-tracking should be handled on the pte level */
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
+
+ return 0;
+}
+
+/* wp_clean_pud_entry - The pagewalk pud callback. */
+static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ /* Dirty-tracking should be handled on the pte level */
+ pud_t pudval = READ_ONCE(*pud);
+
+ if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ WARN_ON(pud_write(pudval) || pud_dirty(pudval));
+
+ return 0;
+}
+
+/*
+ * wp_clean_pre_vma - The pagewalk pre_vma callback.
+ *
+ * The pre_vma callback performs the cache flush, stages the tlb flush
+ * and calls the necessary mmu notifiers.
+ */
+static int wp_clean_pre_vma(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+
+ wpwalk->tlbflush_start = end;
+ wpwalk->tlbflush_end = start;
+
+ mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+ walk->vma, walk->mm, start, end);
+ mmu_notifier_invalidate_range_start(&wpwalk->range);
+ flush_cache_range(walk->vma, start, end);
+
+ /*
+ * We're not using tlb_gather_mmu() since typically
+ * only a small subrange of PTEs is affected, whereas
+ * tlb_gather_mmu() records the full range.
+ */
+ inc_tlb_flush_pending(walk->mm);
+
+ return 0;
+}
+
+/*
+ * wp_clean_post_vma - The pagewalk post_vma callback.
+ *
+ * The post_vma callback performs the tlb flush and calls necessary mmu
+ * notifiers.
+ */
+static void wp_clean_post_vma(struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+
+ if (mm_tlb_flush_nested(walk->mm))
+ flush_tlb_range(walk->vma, wpwalk->range.start,
+ wpwalk->range.end);
+ else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
+ flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
+ wpwalk->tlbflush_end);
+
+ mmu_notifier_invalidate_range_end(&wpwalk->range);
+ dec_tlb_flush_pending(walk->mm);
+}
+
+/*
+ * wp_clean_test_walk - The pagewalk test_walk callback.
+ *
+ * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
+ */
+static int wp_clean_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
+
+ /* Skip non-applicable VMAs */
+ if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
+ (VM_SHARED | VM_MAYWRITE))
+ return 1;
+
+ return 0;
+}
+
+static const struct mm_walk_ops clean_walk_ops = {
+ .pte_entry = clean_record_pte,
+ .pmd_entry = wp_clean_pmd_entry,
+ .pud_entry = wp_clean_pud_entry,
+ .test_walk = wp_clean_test_walk,
+ .pre_vma = wp_clean_pre_vma,
+ .post_vma = wp_clean_post_vma
+};
+
+static const struct mm_walk_ops wp_walk_ops = {
+ .pte_entry = wp_pte,
+ .pmd_entry = wp_clean_pmd_entry,
+ .pud_entry = wp_clean_pud_entry,
+ .test_walk = wp_clean_test_walk,
+ .pre_vma = wp_clean_pre_vma,
+ .post_vma = wp_clean_post_vma
+};
+
+/**
+ * wp_shared_mapping_range - Write-protect all ptes in an address space range
+ * @mapping: The address_space we want to write protect
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ *
+ * Note: This function currently skips transhuge page-table entries, since
+ * it's intended for dirty-tracking on the PTE level. It will warn on
+ * encountering transhuge write-enabled entries, though, and can easily be
+ * extended to handle them as well.
+ *
+ * Return: The number of ptes actually write-protected. Note that
+ * already write-protected ptes are not counted.
+ */
+unsigned long wp_shared_mapping_range(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr)
+{
+ struct wp_walk wpwalk = { .total = 0 };
+
+ i_mmap_lock_read(mapping);
+ WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
+ &wpwalk));
+ i_mmap_unlock_read(mapping);
+
+ return wpwalk.total;
+}
+EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
+
+/**
+ * clean_record_shared_mapping_range - Clean and record all ptes in an
+ * address space range
+ * @mapping: The address_space we want to clean
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ * @bitmap_pgoff: The page offset of the first bit in @bitmap
+ * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
+ * cover the whole range @first_index..@first_index + @nr.
+ * @start: Pointer to the number of the first set bit in @bitmap.
+ * The value is modified as new bits are set by the function.
+ * @end: Pointer to the number of the last set bit in @bitmap.
+ * The value is modified as new bits are set by the function.
+ *
+ * Note: When this function returns there is no guarantee that a CPU has
+ * not already dirtied new ptes. However it will not clean any ptes not
+ * reported in the bitmap. The guarantees are as follows:
+ * a) All ptes dirty when the function starts executing will end up recorded
+ * in the bitmap.
+ * b) All ptes dirtied after that will either remain dirty, be recorded in the
+ * bitmap or both.
+ *
+ * If a caller needs to make sure all dirty ptes are picked up and none
+ * additional are added, it first needs to write-protect the address-space
+ * range and make sure new writers are blocked in page_mkwrite() or
+ * pfn_mkwrite(). Then, after a TLB flush following the write-protection,
+ * pick up all dirty bits.
+ *
+ * Note: This function currently skips transhuge page-table entries, since
+ * it's intended for dirty-tracking on the PTE level. It will warn on
+ * encountering transhuge dirty entries, though, and can easily be extended
+ * to handle them as well.
+ *
+ * Return: The number of dirty ptes actually cleaned.
+ */
+unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ pgoff_t bitmap_pgoff,
+ unsigned long *bitmap,
+ pgoff_t *start,
+ pgoff_t *end)
+{
+ bool none_set = (*start >= *end);
+ struct clean_walk cwalk = {
+ .base = { .total = 0 },
+ .bitmap_pgoff = bitmap_pgoff,
+ .bitmap = bitmap,
+ .start = none_set ? nr : *start,
+ .end = none_set ? 0 : *end,
+ };
+
+ i_mmap_lock_read(mapping);
+ WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
+ &cwalk.base));
+ i_mmap_unlock_read(mapping);
+
+ *start = cwalk.start;
+ *end = cwalk.end;
+
+ return cwalk.base.total;
+}
+EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);
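
Putting the two exports together, a dirty-tracking pass by a hypothetical consumer could look like the sketch below; struct driver_buf and driver_flush_to_device() are invented, and passing *start >= *end follows the "none set yet" convention documented above.

	struct driver_buf {
		struct address_space *mapping;	/* backing address_space */
		pgoff_t first_index;		/* buffer's first page offset */
		pgoff_t num_pages;
		unsigned long *dirty_bitmap;	/* at least num_pages bits */
	};

	static void driver_sync_dirty(struct driver_buf *buf)
	{
		/* start >= end tells the helper no bit is set yet. */
		pgoff_t start = buf->num_pages, end = 0;
		unsigned long cleaned;

		cleaned = clean_record_shared_mapping_range(buf->mapping,
							    buf->first_index,
							    buf->num_pages,
							    buf->first_index,
							    buf->dirty_bitmap,
							    &start, &end);
		/* start/end are relative to bitmap_pgoff (== first_index). */
		if (cleaned)
			driver_flush_to_device(buf, start, end); /* invented */
	}
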
diff --git a/mm/memblock.c b/mm/memblock.c
index 7d4f61ae666a..c4b16cae2bc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
align = SMP_CACHE_BYTES;
}
- if (end > memblock.current_limit)
- end = memblock.current_limit;
-
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
@@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_internal(
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, nid);
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
+
alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
/* retry allocation without lower limit */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c313c49074ca..01f3f8b665e9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -484,7 +484,7 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- if (PageHead(page) && PageSlab(page))
+ if (PageSlab(page) && !PageTail(page))
memcg = memcg_from_slab_page(page);
else
memcg = READ_ONCE(page->mem_cgroup);
@@ -960,7 +960,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -1795,7 +1800,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -2530,6 +2535,15 @@ retry:
}
/*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
@@ -5009,12 +5023,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- /*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
- */
- memcg_flush_percpu_vmstats(memcg, false);
- memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
@@ -5025,6 +5033,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -5415,6 +5429,8 @@ static int mem_cgroup_move_account(struct page *page,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -5438,11 +5454,15 @@ static int mem_cgroup_move_account(struct page *page,
anon = PageAnon(page);
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(pgdat, from);
+ to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -5454,14 +5474,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7ef849da8278..3151c87dff73 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -199,7 +199,6 @@ struct to_kill {
struct task_struct *tsk;
unsigned long addr;
short size_shift;
- char addr_valid;
};
/*
@@ -324,22 +323,27 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
}
tk->addr = page_address_in_vma(p, vma);
- tk->addr_valid = 1;
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
else
tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
/*
- * In theory we don't have to kill when the page was
- * munmaped. But it could be also a mremap. Since that's
- * likely very rare kill anyways just out of paranoia, but use
- * a SIGKILL because the error is not contained anymore.
+ * Send SIGKILL if "tk->addr == -EFAULT". Also, since
+ * "tk->size_shift" is always non-zero for !is_zone_device_page(),
+ * "tk->size_shift == 0" effectively checks for no mapping on
+ * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
+ * to a process' address space, it's possible that not all N VMAs
+ * contain mappings for the page, but at least one VMA does.
+ * Only deliver SIGBUS with a payload derived from the VMA that
+ * has a mapping for the page.
*/
- if (tk->addr == -EFAULT || tk->size_shift == 0) {
+ if (tk->addr == -EFAULT) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
- tk->addr_valid = 0;
+ } else if (tk->size_shift == 0) {
+ kfree(tk);
+ return;
}
get_task_struct(tsk);
tk->tsk = tsk;
@@ -366,7 +370,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* make sure the process doesn't catch the
* signal and then access the memory. Just kill it.
*/
- if (fail || tk->addr_valid == 0) {
+ if (fail || tk->addr == -EFAULT) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
@@ -1253,17 +1257,19 @@ int memory_failure(unsigned long pfn, int flags)
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
- if (!pfn_valid(pfn)) {
+ p = pfn_to_online_page(pfn);
+ if (!p) {
+ if (pfn_valid(pfn)) {
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (pgmap)
+ return memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ }
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
return -ENXIO;
}
- pgmap = get_dev_pagemap(pfn, NULL);
- if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags, pgmap);
-
- p = pfn_to_page(pfn);
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
diff --git a/mm/memory.c b/mm/memory.c
index b1ca51a079f2..b6a5d6a08438 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,18 @@ int randomize_va_space __read_mostly =
2;
#endif
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+ /*
+ * Arches that have a hardware access flag feature need to
+ * implement their own helper; the default ("true") means a
+ * page fault is taken on an old pte.
+ */
+ return true;
+}
+#endif
+
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
@@ -2145,32 +2157,82 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+static inline bool cow_user_page(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
+ bool ret;
+ void *kaddr;
+ void __user *uaddr;
+ bool force_mkyoung;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = vmf->address;
+
debug_dma_assert_idle(src);
+ if (likely(src)) {
+ copy_user_highpage(dst, src, addr, vma);
+ return true;
+ }
+
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
* just copying from the original user address. If that
* fails, we just zero-fill it. Live with it.
*/
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
+ kaddr = kmap_atomic(dst);
+ uaddr = (void __user *)(addr & PAGE_MASK);
+
+ /*
+ * On architectures with software "accessed" bits, we would
+ * take a double page fault, so mark it accessed here.
+ */
+ force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte);
+ if (force_mkyoung) {
+ pte_t entry;
+
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /*
+ * Other thread has already handled the fault
+ * and we don't need to do anything. If it's
+ * not the case, the fault will be triggered
+ * again on the same address.
+ */
+ ret = false;
+ goto pte_unlock;
+ }
+ entry = pte_mkyoung(vmf->orig_pte);
+ if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+ update_mmu_cache(vma, addr, vmf->pte);
+ }
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
/*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
+ * Warn in case there is some obscure
+ * use case.
*/
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- clear_page(kaddr);
- kunmap_atomic(kaddr);
- flush_dcache_page(dst);
- } else
- copy_user_highpage(dst, src, va, vma);
+ WARN_ON_ONCE(1);
+ clear_page(kaddr);
+ }
+
+ ret = true;
+
+pte_unlock:
+ if (force_mkyoung)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(dst);
+
+ return ret;
}
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2327,7 +2389,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, vmf->address, vma);
+
+ if (!cow_user_page(new_page, old_page, vmf)) {
+ /*
+ * COW failed, if the fault was solved by other,
+ * it's fine. If not, userspace would re-fault on
+ * the same address and we will handle the fault
+ * from the second attempt.
+ */
+ put_page(new_page);
+ if (old_page)
+ put_page(old_page);
+ return 0;
+ }
}
if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
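
The weak default above assumes every architecture takes a fault on an old pte; an architecture with a hardware-managed access flag would override it roughly like this sketch (illustrative only, placed in that arch's asm/pgtable.h):

	#define arch_faults_on_old_pte arch_faults_on_old_pte
	static inline bool arch_faults_on_old_pte(void)
	{
		/*
		 * The hardware sets the accessed flag itself, so an old
		 * pte never faults and cow_user_page() can skip the
		 * mkyoung step.
		 */
		return false;
	}
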
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b1be791f772d..f307bd82d750 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -331,7 +331,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long end_pfn)
{
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(start_pfn)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -356,7 +356,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -415,7 +415,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
*/
pfn = zone_start_pfn;
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
@@ -436,67 +436,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- int nid = pgdat->node_id;
-
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
- continue;
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
- if (pfn_to_nid(pfn) != nid)
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
continue;
+ }
- /* If we find valid section, we have nothing to do */
- return;
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
static void __remove_zone(struct zone *zone, unsigned long start_pfn,
@@ -505,9 +471,19 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn,
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
@@ -1680,6 +1656,18 @@ static int check_cpu_on_node(pg_data_t *pgdat)
return 0;
}
+static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
+{
+ int nid = *(int *)arg;
+
+ /*
+ * If a memory block belongs to multiple nodes, the stored nid is not
+ * reliable. However, such blocks are always online (i.e., cannot get
+ * offlined) and, therefore, are still spanned by the node.
+ */
+ return mem->nid == nid ? -EEXIST : 0;
+}
+
/**
* try_offline_node
* @nid: the node ID
@@ -1692,25 +1680,24 @@ static int check_cpu_on_node(pg_data_t *pgdat)
void try_offline_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = pgdat->node_start_pfn;
- unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
-
- if (!present_section_nr(section_nr))
- continue;
+ int rc;
- if (pfn_to_nid(pfn) != nid)
- continue;
+ /*
+ * If the node still spans pages (especially ZONE_DEVICE), don't
+ * offline it. A node spans memory after move_pfn_range_to_zone(),
+ * e.g., after the memory block was onlined.
+ */
+ if (pgdat->node_spanned_pages)
+ return;
- /*
- * some memory sections of this node are not removed, and we
- * can't offline node now.
- */
+ /*
+ * Especially offline memory blocks might not be spanned by the
+ * node. They will get spanned by the node once they get onlined.
+ * However, they link to the node in sysfs and can get onlined later.
+ */
+ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
+ if (rc)
return;
- }
if (check_cpu_on_node(pgdat))
return;
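The update_pgdat_span() rewrite above is just a min/max fold over the populated zones. A standalone sketch of the same computation, detached from the kernel types for illustration:

#include <stddef.h>

struct zone_span {
	unsigned long start_pfn;
	unsigned long spanned_pages;
};

/*
 * Fold the populated zones into a single [lo, hi) node span. A node
 * with no populated zones collapses to an empty span, matching the
 * node_start_pfn = node_spanned_pages = 0 case replaced above.
 */
static void node_span(const struct zone_span *zones, size_t n,
		      unsigned long *start_pfn, unsigned long *spanned)
{
	unsigned long lo = 0, hi = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		unsigned long zlo = zones[i].start_pfn;
		unsigned long zhi = zlo + zones[i].spanned_pages;

		if (!zones[i].spanned_pages)
			continue;
		if (!hi) {
			lo = zlo;
			hi = zhi;
			continue;
		}
		if (zhi > hi)
			hi = zhi;
		if (zlo < lo)
			lo = zlo;
	}
	*start_pfn = lo;
	*spanned = hi - lo;
}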
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ae967bcf954..e08c94170ae4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -672,7 +672,9 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
* 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 0 - queue pages successfully or no misplaced page.
- * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified.
+ * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
+ *         a memory range specified by nodemask and maxnode that points
+ *         outside the accessible address space (-EFAULT)
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
@@ -1286,7 +1288,7 @@ static long do_mbind(unsigned long start, unsigned long len,
flags | MPOL_MF_INVERT, &pagelist);
if (ret < 0) {
- err = -EIO;
+ err = ret;
goto up_out;
}
@@ -1305,10 +1307,12 @@ static long do_mbind(unsigned long start, unsigned long len,
if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
- } else
- putback_movable_pages(&pagelist);
-
+ } else {
up_out:
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+ }
+
up_write(&mm->mmap_sem);
mpol_out:
mpol_put(new);
diff --git a/mm/memremap.c b/mm/memremap.c
index 32c79b51af86..03ccbdfeb697 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -13,8 +13,6 @@
#include <linux/xarray.h>
static DEFINE_XARRAY(pgmap_array);
-#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
-#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
@@ -105,6 +103,7 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
void memunmap_pages(struct dev_pagemap *pgmap)
{
struct resource *res = &pgmap->res;
+ struct page *first_page;
unsigned long pfn;
int nid;
@@ -113,14 +112,16 @@ void memunmap_pages(struct dev_pagemap *pgmap)
put_page(pfn_to_page(pfn));
dev_pagemap_cleanup(pgmap);
+ /* make sure to access a memmap that was actually initialized */
+ first_page = pfn_to_page(pfn_first(pgmap));
+
/* pages are dead and unused, undo the arch mapping */
- nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start)));
+ nid = page_to_nid(first_page);
mem_hotplug_begin();
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- pfn = PHYS_PFN(res->start);
- __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
- PHYS_PFN(resource_size(res)), NULL);
+ __remove_pages(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), NULL);
} else {
arch_remove_memory(nid, res->start, resource_size(res),
pgmap_altmap(pgmap));
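pfn_first() is not part of this hunk; in this tree it presumably resolves to the resource start pfn pushed past any altmap reservation, which is what makes first_page safe to dereference. A sketch under that assumption:

/*
 * Sketch (assumption, not from this diff): first pfn whose struct page
 * was actually initialized. Pages inside the altmap reservation have
 * no usable memmap, which is why memunmap_pages() above must not feed
 * PHYS_PFN(res->start) directly to page_to_nid()/page_zone().
 */
static unsigned long pfn_first_sketch(struct dev_pagemap *pgmap)
{
	struct vmem_altmap *altmap = pgmap_altmap(pgmap);

	return PHYS_PFN(pgmap->res.start) +
	       (altmap ? vmem_altmap_offset(altmap) : 0);
}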
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7fde88695f35..f76ea05b1cb0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
+#include <linux/interval_tree.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
@@ -28,6 +29,254 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
#endif
/*
+ * The mmu_notifier_mm structure is allocated and installed in
+ * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
+ * critical section and it's released only when mm_count reaches zero
+ * in mmdrop().
+ */
+struct mmu_notifier_mm {
+ /* all mmu notifiers registered in this mm are queued in this list */
+ struct hlist_head list;
+ bool has_itree;
+ /* to serialize the list modifications and hlist_unhashed */
+ spinlock_t lock;
+ unsigned long invalidate_seq;
+ unsigned long active_invalidate_ranges;
+ struct rb_root_cached itree;
+ wait_queue_head_t wq;
+ struct hlist_head deferred_list;
+};
+
+/*
+ * This is a collision-retry read-side/write-side 'lock', a lot like a
+ * seqcount; however, this allows multiple write-sides to hold it at
+ * once. Conceptually the write side is protecting the values of the PTEs in
+ * this mm, such that PTEs cannot be read into SPTEs (shadow PTEs) while any
+ * writer exists.
+ *
+ * Note that the core mm creates nested invalidate_range_start()/end() regions
+ * within the same thread, and runs invalidate_range_start()/end() in parallel
+ * on multiple CPUs. This is designed to not reduce concurrency or block
+ * progress on the mm side.
+ *
+ * As a secondary function, holding the full write side also serves to prevent
+ * writers for the itree, this is an optimization to avoid extra locking
+ * during invalidate_range_start/end notifiers.
+ *
+ * The write side has two states, fully excluded:
+ *  - mmn_mm->active_invalidate_ranges != 0
+ *  - mmn_mm->invalidate_seq & 1 == True (odd)
+ *  - some range on the mm_struct is being invalidated
+ *  - the itree is not allowed to change
+ *
+ * And partially excluded:
+ *  - mmn_mm->active_invalidate_ranges != 0
+ *  - mmn_mm->invalidate_seq & 1 == False (even)
+ * - some range on the mm_struct is being invalidated
+ * - the itree is allowed to change
+ *
+ * Operations on mmu_notifier_mm->invalidate_seq (under spinlock):
+ * seq |= 1 # Begin writing
+ * seq++ # Release the writing state
+ * seq & 1 # True if a writer exists
+ *
+ * The latter state avoids some expensive work on inv_end in the common case of
+ * no mni monitoring the VA.
+ */
+static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm)
+{
+ lockdep_assert_held(&mmn_mm->lock);
+ return mmn_mm->invalidate_seq & 1;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm,
+ const struct mmu_notifier_range *range,
+ unsigned long *seq)
+{
+ struct interval_tree_node *node;
+ struct mmu_interval_notifier *res = NULL;
+
+ spin_lock(&mmn_mm->lock);
+ mmn_mm->active_invalidate_ranges++;
+ node = interval_tree_iter_first(&mmn_mm->itree, range->start,
+ range->end - 1);
+ if (node) {
+ mmn_mm->invalidate_seq |= 1;
+ res = container_of(node, struct mmu_interval_notifier,
+ interval_tree);
+ }
+
+ *seq = mmn_mm->invalidate_seq;
+ spin_unlock(&mmn_mm->lock);
+ return res;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_next(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range)
+{
+ struct interval_tree_node *node;
+
+ node = interval_tree_iter_next(&mni->interval_tree, range->start,
+ range->end - 1);
+ if (!node)
+ return NULL;
+ return container_of(node, struct mmu_interval_notifier, interval_tree);
+}
+
+static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
+{
+ struct mmu_interval_notifier *mni;
+ struct hlist_node *next;
+
+ spin_lock(&mmn_mm->lock);
+ if (--mmn_mm->active_invalidate_ranges ||
+ !mn_itree_is_invalidating(mmn_mm)) {
+ spin_unlock(&mmn_mm->lock);
+ return;
+ }
+
+ /* Make invalidate_seq even */
+ mmn_mm->invalidate_seq++;
+
+ /*
+ * The inv_end incorporates a deferred mechanism like rtnl_unlock().
+ * Adds and removes are queued until the final inv_end happens then
+ * they are progressed. This arrangement for tree updates is used to
+ * avoid using a blocking lock during invalidate_range_start.
+ */
+ hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list,
+ deferred_item) {
+ if (RB_EMPTY_NODE(&mni->interval_tree.rb))
+ interval_tree_insert(&mni->interval_tree,
+ &mmn_mm->itree);
+ else
+ interval_tree_remove(&mni->interval_tree,
+ &mmn_mm->itree);
+ hlist_del(&mni->deferred_item);
+ }
+ spin_unlock(&mmn_mm->lock);
+
+ wake_up_all(&mmn_mm->wq);
+}
+
+/**
+ * mmu_interval_read_begin - Begin a read side critical section against a VA
+ * range
+ * @mni: The range to use
+ *
+ * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
+ * collision-retry scheme similar to seqcount for the VA range under mni. If
+ * the mm invokes invalidation during the critical section then
+ * mmu_interval_read_retry() will return true.
+ *
+ * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
+ * requires a blocking context. The critical region formed by this can sleep,
+ * and the required 'user_lock' can also be a sleeping lock.
+ *
+ * The caller is required to provide a 'user_lock' to serialize both teardown
+ * and setup.
+ *
+ * The return value should be passed to mmu_interval_read_retry().
+ */
+unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni)
+{
+ struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm;
+ unsigned long seq;
+ bool is_invalidating;
+
+ /*
+ * If the mni has a different seq value under the user_lock than we
+ * started with then it has collided.
+ *
+ * If the mni currently has the same seq value as the mmn_mm seq, then
+ * it is currently between invalidate_start/end and is colliding.
+ *
+ * The locking looks broadly like this:
+ * mn_tree_invalidate_start(): mmu_interval_read_begin():
+ * spin_lock
+ * seq = READ_ONCE(mni->invalidate_seq);
+ * seq == mmn_mm->invalidate_seq
+ * spin_unlock
+ * spin_lock
+ * seq = ++mmn_mm->invalidate_seq
+ * spin_unlock
+ * op->invalidate_range():
+ * user_lock
+ * mmu_interval_set_seq()
+ * mni->invalidate_seq = seq
+ * user_unlock
+ *
+ * [Required: mmu_interval_read_retry() == true]
+ *
+ * mn_itree_inv_end():
+ * spin_lock
+ * seq = ++mmn_mm->invalidate_seq
+ * spin_unlock
+ *
+ * user_lock
+ * mmu_interval_read_retry():
+ * mni->invalidate_seq != seq
+ * user_unlock
+ *
+ * Barriers are not needed here as any races here are closed by an
+ * eventual mmu_interval_read_retry(), which provides a barrier via the
+ * user_lock.
+ */
+ spin_lock(&mmn_mm->lock);
+ /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
+ seq = READ_ONCE(mni->invalidate_seq);
+ is_invalidating = seq == mmn_mm->invalidate_seq;
+ spin_unlock(&mmn_mm->lock);
+
+ /*
+ * mni->invalidate_seq must always be set to an odd value via
+ * mmu_interval_set_seq() using the provided cur_seq from
+ * mn_itree_inv_start_range(). This ensures that if seq does wrap we
+ * will always clear the below sleep in some reasonable time as
+ * mmn_mm->invalidate_seq is even in the idle state.
+ */
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ if (is_invalidating)
+ wait_event(mmn_mm->wq,
+ READ_ONCE(mmn_mm->invalidate_seq) != seq);
+
+ /*
+ * Notice that mmu_interval_read_retry() can already be true at this
+ * point, avoiding loops here allows the caller to provide a global
+ * time bound.
+ */
+
+ return seq;
+}
+EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
+
+static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier_range range = {
+ .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
+ .event = MMU_NOTIFY_RELEASE,
+ .mm = mm,
+ .start = 0,
+ .end = ULONG_MAX,
+ };
+ struct mmu_interval_notifier *mni;
+ unsigned long cur_seq;
+ bool ret;
+
+ for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni;
+ mni = mn_itree_inv_next(mni, &range)) {
+ ret = mni->ops->invalidate(mni, &range, cur_seq);
+ WARN_ON(!ret);
+ }
+
+ mn_itree_inv_end(mmn_mm);
+}
+
+/*
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
* runs with mm_users == 0. Other tasks may still invoke mmu notifiers
@@ -39,7 +288,8 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
* can't go away from under us as exit_mmap holds an mm_count pin
* itself.
*/
-void __mmu_notifier_release(struct mm_struct *mm)
+static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm,
+ struct mm_struct *mm)
{
struct mmu_notifier *mn;
int id;
@@ -49,7 +299,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
* ->release returns.
*/
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+ hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist)
/*
* If ->release runs before mmu_notifier_unregister it must be
* handled, as it's the only way for the driver to flush all
@@ -59,10 +309,9 @@ void __mmu_notifier_release(struct mm_struct *mm)
if (mn->ops->release)
mn->ops->release(mn, mm);
- spin_lock(&mm->mmu_notifier_mm->lock);
- while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
- mn = hlist_entry(mm->mmu_notifier_mm->list.first,
- struct mmu_notifier,
+ spin_lock(&mmn_mm->lock);
+ while (unlikely(!hlist_empty(&mmn_mm->list))) {
+ mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier,
hlist);
/*
* We arrived before mmu_notifier_unregister so
@@ -72,7 +321,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
*/
hlist_del_init_rcu(&mn->hlist);
}
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ spin_unlock(&mmn_mm->lock);
srcu_read_unlock(&srcu, id);
/*
@@ -87,6 +336,17 @@ void __mmu_notifier_release(struct mm_struct *mm)
synchronize_srcu(&srcu);
}
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
+
+ if (mmn_mm->has_itree)
+ mn_itree_release(mmn_mm, mm);
+
+ if (!hlist_empty(&mmn_mm->list))
+ mn_hlist_release(mmn_mm, mm);
+}
+
/*
* If no young bitflag is supported by the hardware, ->clear_flush_young can
* unmap the address and return 1 or 0 depending if the mapping previously
@@ -159,14 +419,43 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
+static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm,
+ const struct mmu_notifier_range *range)
+{
+ struct mmu_interval_notifier *mni;
+ unsigned long cur_seq;
+
+ for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni;
+ mni = mn_itree_inv_next(mni, range)) {
+ bool ret;
+
+ ret = mni->ops->invalidate(mni, range, cur_seq);
+ if (!ret) {
+ if (WARN_ON(mmu_notifier_range_blockable(range)))
+ continue;
+ goto out_would_block;
+ }
+ }
+ return 0;
+
+out_would_block:
+ /*
+ * On -EAGAIN the non-blocking caller is not allowed to call
+ * invalidate_range_end()
+ */
+ mn_itree_inv_end(mmn_mm);
+ return -EAGAIN;
+}
+
+static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm,
+ struct mmu_notifier_range *range)
{
struct mmu_notifier *mn;
int ret = 0;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
if (mn->ops->invalidate_range_start) {
int _ret;
@@ -180,7 +469,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
mn->ops->invalidate_range_start, _ret,
!mmu_notifier_range_blockable(range) ? "non-" : "");
WARN_ON(mmu_notifier_range_blockable(range) ||
- ret != -EAGAIN);
+ _ret != -EAGAIN);
ret = _ret;
}
}
@@ -190,15 +479,30 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
return ret;
}
-void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
- bool only_end)
+int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
+{
+ struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
+ int ret;
+
+ if (mmn_mm->has_itree) {
+ ret = mn_itree_invalidate(mmn_mm, range);
+ if (ret)
+ return ret;
+ }
+ if (!hlist_empty(&mmn_mm->list))
+ return mn_hlist_invalidate_range_start(mmn_mm, range);
+ return 0;
+}
+
+static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm,
+ struct mmu_notifier_range *range,
+ bool only_end)
{
struct mmu_notifier *mn;
int id;
- lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
@@ -225,6 +529,19 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
}
}
srcu_read_unlock(&srcu, id);
+}
+
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
+ bool only_end)
+{
+ struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
+
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ if (mmn_mm->has_itree)
+ mn_itree_inv_end(mmn_mm);
+
+ if (!hlist_empty(&mmn_mm->list))
+ mn_hlist_invalidate_end(mmn_mm, range, only_end);
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
@@ -243,8 +560,9 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
/*
- * Same as mmu_notifier_register but here the caller must hold the
- * mmap_sem in write mode.
+ * Same as mmu_notifier_register but here the caller must hold the mmap_sem in
+ * write mode. A NULL mn signals the notifier is being registered for itree
+ * mode.
*/
int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
@@ -261,9 +579,6 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
fs_reclaim_release(GFP_KERNEL);
}
- mn->mm = mm;
- mn->users = 1;
-
if (!mm->mmu_notifier_mm) {
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
@@ -271,21 +586,22 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
* the write side of the mmap_sem.
*/
mmu_notifier_mm =
- kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
if (!mmu_notifier_mm)
return -ENOMEM;
INIT_HLIST_HEAD(&mmu_notifier_mm->list);
spin_lock_init(&mmu_notifier_mm->lock);
+ mmu_notifier_mm->invalidate_seq = 2;
+ mmu_notifier_mm->itree = RB_ROOT_CACHED;
+ init_waitqueue_head(&mmu_notifier_mm->wq);
+ INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list);
}
ret = mm_take_all_locks(mm);
if (unlikely(ret))
goto out_clean;
- /* Pairs with the mmdrop in mmu_notifier_unregister_* */
- mmgrab(mm);
-
/*
* Serialize the update against mmu_notifier_unregister. A
* side note: mmu_notifier_release can't run concurrently with
@@ -293,13 +609,28 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
* current->mm or explicitly with get_task_mm() or similar).
* We can't race against any other mmu notifier method either
* thanks to mm_take_all_locks().
+ *
+	 * Release semantics on the initialization of the mmu_notifier_mm's
+	 * contents are provided for unlocked readers. Acquire can only be
+	 * used while holding the mmgrab or mmget, and is safe because once
+	 * created the mmu_notifier_mm is not freed until the mm is
+	 * destroyed. As above, users holding the mmap_sem or one of the
+	 * mm_take_all_locks() do not need to use acquire semantics.
*/
if (mmu_notifier_mm)
- mm->mmu_notifier_mm = mmu_notifier_mm;
+ smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm);
- spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ if (mn) {
+ /* Pairs with the mmdrop in mmu_notifier_unregister_* */
+ mmgrab(mm);
+ mn->mm = mm;
+ mn->users = 1;
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ } else
+ mm->mmu_notifier_mm->has_itree = true;
mm_drop_all_locks(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
@@ -516,6 +847,180 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);
+static int __mmu_interval_notifier_insert(
+ struct mmu_interval_notifier *mni, struct mm_struct *mm,
+ struct mmu_notifier_mm *mmn_mm, unsigned long start,
+ unsigned long length, const struct mmu_interval_notifier_ops *ops)
+{
+ mni->mm = mm;
+ mni->ops = ops;
+ RB_CLEAR_NODE(&mni->interval_tree.rb);
+ mni->interval_tree.start = start;
+ /*
+ * Note that the representation of the intervals in the interval tree
+ * considers the ending point as contained in the interval.
+ */
+ if (length == 0 ||
+ check_add_overflow(start, length - 1, &mni->interval_tree.last))
+ return -EOVERFLOW;
+
+ /* Must call with a mmget() held */
+ if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
+ return -EINVAL;
+
+ /* pairs with mmdrop in mmu_interval_notifier_remove() */
+ mmgrab(mm);
+
+ /*
+ * If some invalidate_range_start/end region is going on in parallel
+ * we don't know what VA ranges are affected, so we must assume this
+ * new range is included.
+ *
+ * If the itree is invalidating then we are not allowed to change
+ * it. Retrying until invalidation is done is tricky due to the
+ * possibility for live lock, instead defer the add to
+ * mn_itree_inv_end() so this algorithm is deterministic.
+ *
+ * In all cases the value for the mni->invalidate_seq should be
+ * odd, see mmu_interval_read_begin()
+ */
+ spin_lock(&mmn_mm->lock);
+ if (mmn_mm->active_invalidate_ranges) {
+ if (mn_itree_is_invalidating(mmn_mm))
+ hlist_add_head(&mni->deferred_item,
+ &mmn_mm->deferred_list);
+ else {
+ mmn_mm->invalidate_seq |= 1;
+ interval_tree_insert(&mni->interval_tree,
+ &mmn_mm->itree);
+ }
+ mni->invalidate_seq = mmn_mm->invalidate_seq;
+ } else {
+ WARN_ON(mn_itree_is_invalidating(mmn_mm));
+ /*
+ * The starting seq for a mni not under invalidation should be
+ * odd, not equal to the current invalidate_seq and
+ * invalidate_seq should not 'wrap' to the new seq any time
+ * soon.
+ */
+ mni->invalidate_seq = mmn_mm->invalidate_seq - 1;
+ interval_tree_insert(&mni->interval_tree, &mmn_mm->itree);
+ }
+ spin_unlock(&mmn_mm->lock);
+ return 0;
+}
+
+/**
+ * mmu_interval_notifier_insert - Insert an interval notifier
+ * @mni: Interval notifier to register
+ * @start: Starting virtual address to monitor
+ * @length: Length of the range to monitor
+ * @mm: mm_struct to attach to
+ *
+ * This function subscribes the interval notifier for notifications from the
+ * mm. Upon return the ops related to mmu_interval_notifier will be called
+ * whenever an event that intersects with the given range occurs.
+ *
+ * Upon return the interval notifier may not be present in the interval tree yet.
+ * The caller must use the normal interval notifier read flow via
+ * mmu_interval_read_begin() to establish SPTEs for this range.
+ */
+int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct mmu_notifier_mm *mmn_mm;
+ int ret;
+
+ might_lock(&mm->mmap_sem);
+
+ mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm);
+ if (!mmn_mm || !mmn_mm->has_itree) {
+ ret = mmu_notifier_register(NULL, mm);
+ if (ret)
+ return ret;
+ mmn_mm = mm->mmu_notifier_mm;
+ }
+ return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length,
+ ops);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
+
+int mmu_interval_notifier_insert_locked(
+ struct mmu_interval_notifier *mni, struct mm_struct *mm,
+ unsigned long start, unsigned long length,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct mmu_notifier_mm *mmn_mm;
+ int ret;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ mmn_mm = mm->mmu_notifier_mm;
+ if (!mmn_mm || !mmn_mm->has_itree) {
+ ret = __mmu_notifier_register(NULL, mm);
+ if (ret)
+ return ret;
+ mmn_mm = mm->mmu_notifier_mm;
+ }
+ return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length,
+ ops);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+
+/**
+ * mmu_interval_notifier_remove - Remove an interval notifier
+ * @mni: Interval notifier to unregister
+ *
+ * This function must be paired with mmu_interval_notifier_insert(). It cannot
+ * be called from any ops callback.
+ *
+ * Once this returns ops callbacks are no longer running on other CPUs and
+ * will not be called in future.
+ */
+void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni)
+{
+ struct mm_struct *mm = mni->mm;
+ struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
+ unsigned long seq = 0;
+
+ might_sleep();
+
+ spin_lock(&mmn_mm->lock);
+ if (mn_itree_is_invalidating(mmn_mm)) {
+ /*
+		 * remove is being called after an insert put this on the
+ * deferred list, but before the deferred list was processed.
+ */
+ if (RB_EMPTY_NODE(&mni->interval_tree.rb)) {
+ hlist_del(&mni->deferred_item);
+ } else {
+ hlist_add_head(&mni->deferred_item,
+ &mmn_mm->deferred_list);
+ seq = mmn_mm->invalidate_seq;
+ }
+ } else {
+ WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb));
+ interval_tree_remove(&mni->interval_tree, &mmn_mm->itree);
+ }
+ spin_unlock(&mmn_mm->lock);
+
+ /*
+ * The possible sleep on progress in the invalidation requires the
+ * caller not hold any locks held by invalidation callbacks.
+ */
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ if (seq)
+ wait_event(mmn_mm->wq,
+ READ_ONCE(mmn_mm->invalidate_seq) != seq);
+
+ /* pairs with mmgrab in mmu_interval_notifier_insert() */
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);
+
/**
* mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
*
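The comments above sketch the intended consumer flow. Putting it together as a hypothetical driver (the drv_* names, struct drv_range, and driver_lock are placeholders; mmu_interval_set_seq() and mmu_interval_read_retry() are the counterparts referenced in the mmu_interval_read_begin() documentation):

struct drv_range {
	struct mmu_interval_notifier mni;
	struct mutex driver_lock;
	/* device page-table state ... */
};

/*
 * Invalidation side: bump the notifier's seq under the driver lock so
 * a racing drv_fault() sees the collision via mmu_interval_read_retry().
 */
static bool drv_invalidate(struct mmu_interval_notifier *mni,
			   const struct mmu_notifier_range *range,
			   unsigned long cur_seq)
{
	struct drv_range *drange = container_of(mni, struct drv_range, mni);

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&drange->driver_lock);
	mmu_interval_set_seq(mni, cur_seq);
	/* tear down the device SPTEs covering [range->start, range->end) */
	mutex_unlock(&drange->driver_lock);
	return true;
}

/*
 * Fault side: the collision-retry loop. Building the SPTEs may sleep;
 * the commit is only valid if no invalidation ran in between.
 */
static void drv_fault(struct drv_range *drange)
{
	unsigned long seq;

	do {
		seq = mmu_interval_read_begin(&drange->mni);

		/* gather PTEs, may sleep and take mmap_sem */

		mutex_lock(&drange->driver_lock);
		if (mmu_interval_read_retry(&drange->mni, seq)) {
			mutex_unlock(&drange->driver_lock);
			continue;	/* collided, start over */
		}
		/* commit the SPTEs to the device */
		mutex_unlock(&drange->driver_lock);
		return;
	} while (true);
}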
diff --git a/mm/nommu.c b/mm/nommu.c
index 99b7ec318824..7de592058ab4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -155,11 +155,11 @@ void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
return __vmalloc(size, flags, PAGE_KERNEL);
}
-void *vmalloc_user(unsigned long size)
+static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
{
void *ret;
- ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
if (ret) {
struct vm_area_struct *vma;
@@ -172,8 +172,19 @@ void *vmalloc_user(unsigned long size)
return ret;
}
+
+void *vmalloc_user(unsigned long size)
+{
+ return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
+}
EXPORT_SYMBOL(vmalloc_user);
+void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
+{
+ return __vmalloc_user_flags(size, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vmalloc_user_node_flags);
+
struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15c2050c629b..f391c0c4ed1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1175,11 +1175,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- arch_free_page(page, order);
if (want_init_on_free())
kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order, 0);
+ /*
+ * arch_free_page() can make the page's contents inaccessible. s390
+ * does this. So nothing which can access the page's contents should
+ * happen after this.
+ */
+ arch_free_page(page, order);
+
if (debug_pagealloc_enabled())
kernel_map_pages(page, 1 << order, 0);
@@ -1942,6 +1948,14 @@ void __init page_alloc_init_late(void)
wait_for_completion(&pgdat_init_all_done_comp);
/*
+ * The number of managed pages has changed due to the initialisation
+ * so the pcpu batch and high limits needs to be updated or the limits
+ * will be artificially small.
+ */
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone);
+
+ /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -3714,10 +3728,6 @@ try_this_zone:
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
-
- if (!__ratelimit(&show_mem_rs))
- return;
/*
* This documents exceptions given to allocations in certain
@@ -3738,8 +3748,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
@@ -4467,12 +4476,14 @@ retry_cpuset:
if (page)
goto got_pg;
- if (order >= pageblock_order && (gfp_mask & __GFP_IO)) {
+ if (order >= pageblock_order && (gfp_mask & __GFP_IO) &&
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)) {
/*
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
* or is prohibited because it recently failed at this
- * order, fail immediately.
+ * order, fail immediately unless the allocator has
+ * requested compaction and reclaim retry.
*
* Reclaim is
* - potentially very expensive because zones are far
@@ -8506,7 +8517,6 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
@@ -8520,7 +8530,6 @@ void __meminit zone_pcp_update(struct zone *zone)
per_cpu_ptr(zone->pageset, cpu));
mutex_unlock(&pcp_batch_high_lock);
}
-#endif
void zone_pcp_reset(struct zone *zone)
{
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 5f5769c7db3b..4ade843ff588 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -67,8 +67,9 @@ static struct page_ext_operations *page_ext_ops[] = {
#endif
};
+unsigned long page_ext_size = sizeof(struct page_ext);
+
static unsigned long total_usage;
-static unsigned long extra_mem;
static bool __init invoke_need_callbacks(void)
{
@@ -78,9 +79,8 @@ static bool __init invoke_need_callbacks(void)
for (i = 0; i < entries; i++) {
if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
- page_ext_ops[i]->offset = sizeof(struct page_ext) +
- extra_mem;
- extra_mem += page_ext_ops[i]->size;
+ page_ext_ops[i]->offset = page_ext_size;
+ page_ext_size += page_ext_ops[i]->size;
need = true;
}
}
@@ -99,14 +99,9 @@ static void __init invoke_init_callbacks(void)
}
}
-static unsigned long get_entry_size(void)
-{
- return sizeof(struct page_ext) + extra_mem;
-}
-
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
- return base + get_entry_size() * index;
+ return base + page_ext_size * index;
}
#if !defined(CONFIG_SPARSEMEM)
@@ -156,7 +151,7 @@ static int __init alloc_node_page_ext(int nid)
!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
nr_pages += MAX_ORDER_NR_PAGES;
- table_size = get_entry_size() * nr_pages;
+ table_size = page_ext_size * nr_pages;
base = memblock_alloc_try_nid(
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
@@ -234,7 +229,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
if (section->page_ext)
return 0;
- table_size = get_entry_size() * PAGES_PER_SECTION;
+ table_size = page_ext_size * PAGES_PER_SECTION;
base = alloc_page_ext(table_size, nid);
/*
@@ -254,7 +249,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
* we need to apply a mask.
*/
pfn &= PAGE_SECTION_MASK;
- section->page_ext = (void *)base - get_entry_size() * pfn;
+ section->page_ext = (void *)base - page_ext_size * pfn;
total_usage += table_size;
return 0;
}
@@ -267,7 +262,7 @@ static void free_page_ext(void *addr)
struct page *page = virt_to_page(addr);
size_t table_size;
- table_size = get_entry_size() * PAGES_PER_SECTION;
+ table_size = page_ext_size * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
kmemleak_free(addr);
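page_ext_next(), used by the page_owner hunks below, is not part of this diff; given the page_ext_size variable exported above, it presumably reduces to the same pointer arithmetic as get_entry(). A sketch under that assumption:

/*
 * Sketch (assumption): step from one pfn's page_ext entry to the
 * next. Entries are page_ext_size bytes apart, matching get_entry().
 */
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
	void *next = curr;

	next += page_ext_size;
	return next;
}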
diff --git a/mm/page_io.c b/mm/page_io.c
index 24ee600f9131..60a66a58b9bf 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -73,6 +73,7 @@ static void swap_slot_free_notify(struct page *page)
{
struct swap_info_struct *sis;
struct gendisk *disk;
+ swp_entry_t entry;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -104,11 +105,10 @@ static void swap_slot_free_notify(struct page *page)
* we again wish to reclaim it.
*/
disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
+ entry.val = page_private(page);
+ if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
unsigned long offset;
- entry.val = page_private(page);
offset = swp_offset(entry);
SetPageDirty(page);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index dee931184788..18ecde9f45b2 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -24,12 +24,10 @@ struct page_owner {
short last_migrate_reason;
gfp_t gfp_mask;
depot_stack_handle_t handle;
-#ifdef CONFIG_DEBUG_PAGEALLOC
depot_stack_handle_t free_handle;
-#endif
};
-static bool page_owner_disabled = true;
+static bool page_owner_enabled = false;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
@@ -44,7 +42,7 @@ static int __init early_page_owner_param(char *buf)
return -EINVAL;
if (strcmp(buf, "on") == 0)
- page_owner_disabled = false;
+ page_owner_enabled = true;
return 0;
}
@@ -52,10 +50,7 @@ early_param("page_owner", early_page_owner_param);
static bool need_page_owner(void)
{
- if (page_owner_disabled)
- return false;
-
- return true;
+ return page_owner_enabled;
}
static __always_inline depot_stack_handle_t create_dummy_stack(void)
@@ -84,7 +79,7 @@ static noinline void register_early_stack(void)
static void init_page_owner(void)
{
- if (page_owner_disabled)
+ if (!page_owner_enabled)
return;
register_dummy_stack();
@@ -148,25 +143,19 @@ void __reset_page_owner(struct page *page, unsigned int order)
{
int i;
struct page_ext *page_ext;
-#ifdef CONFIG_DEBUG_PAGEALLOC
depot_stack_handle_t handle = 0;
struct page_owner *page_owner;
- if (debug_pagealloc_enabled())
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
-#endif
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
for (i = 0; i < (1 << order); i++) {
- page_ext = lookup_page_ext(page + i);
- if (unlikely(!page_ext))
- continue;
- __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled()) {
- page_owner = get_page_owner(page_ext);
- page_owner->free_handle = handle;
- }
-#endif
+ __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_owner = get_page_owner(page_ext);
+ page_owner->free_handle = handle;
+ page_ext = page_ext_next(page_ext);
}
}
@@ -184,9 +173,9 @@ static inline void __set_page_owner_handle(struct page *page,
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
- __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
- page_ext = lookup_page_ext(page + i);
+ page_ext = page_ext_next(page_ext);
}
}
@@ -224,12 +213,10 @@ void __split_page_owner(struct page *page, unsigned int order)
if (unlikely(!page_ext))
return;
- page_owner = get_page_owner(page_ext);
- page_owner->order = 0;
- for (i = 1; i < (1 << order); i++) {
- page_ext = lookup_page_ext(page + i);
+ for (i = 0; i < (1 << order); i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
+ page_ext = page_ext_next(page_ext);
}
}
@@ -260,7 +247,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
* the new page, which will be freed.
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
- __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -284,7 +271,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
- if (!pfn_valid(pfn)) {
+ page = pfn_to_online_page(pfn);
+ if (!page) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}
@@ -292,13 +280,13 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
+ /* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
@@ -320,7 +308,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (unlikely(!page_ext))
continue;
- if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
continue;
page_owner = get_page_owner(page_ext);
@@ -435,7 +423,7 @@ void __dump_page_owner(struct page *page)
return;
}
- if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
pr_alert("page_owner tracks the page as allocated\n");
else
pr_alert("page_owner tracks the page as freed\n");
@@ -451,7 +439,6 @@ void __dump_page_owner(struct page *page)
stack_trace_print(entries, nr_entries, 0);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
handle = READ_ONCE(page_owner->free_handle);
if (!handle) {
pr_alert("page_owner free stack trace missing\n");
@@ -460,7 +447,6 @@ void __dump_page_owner(struct page *page)
pr_alert("page last free stack trace:\n");
stack_trace_print(entries, nr_entries, 0);
}
-#endif
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
@@ -527,7 +513,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* Although we do have the info about past allocation of free
* pages, it's not relevant for current memory usage.
*/
- if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
continue;
page_owner = get_page_owner(page_ext);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d48c2a986ea3..ea0b9e606ad1 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -10,8 +10,9 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;
const struct mm_walk_ops *ops = walk->ops;
+ spinlock_t *ptl;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (;;) {
err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
@@ -22,7 +23,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte++;
}
- pte_unmap(pte);
+ pte_unmap_unlock(pte, ptl);
return err;
}
@@ -253,13 +254,23 @@ static int __walk_page_range(unsigned long start, unsigned long end,
{
int err = 0;
struct vm_area_struct *vma = walk->vma;
+ const struct mm_walk_ops *ops = walk->ops;
+
+ if (vma && ops->pre_vma) {
+ err = ops->pre_vma(start, end, walk);
+ if (err)
+ return err;
+ }
if (vma && is_vm_hugetlb_page(vma)) {
- if (walk->ops->hugetlb_entry)
+ if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
+ if (vma && ops->post_vma)
+ ops->post_vma(walk);
+
return err;
}
@@ -290,6 +301,11 @@ static int __walk_page_range(unsigned long start, unsigned long end,
* its vm_flags. walk_page_test() and @ops->test_walk() are used for this
* purpose.
*
+ * If operations need to be staged before and committed after a vma is walked,
+ * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
+ * since it is intended to handle commit-type operations, can't return any
+ * errors.
+ *
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
* caller-specific data to callbacks, @private should be helpful.
@@ -376,3 +392,80 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
return err;
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
+
+/**
+ * walk_page_mapping - walk all memory areas mapped into a struct address_space.
+ * @mapping: Pointer to the struct address_space
+ * @first_index: First page offset in the address_space
+ * @nr: Number of incremental page offsets to cover
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
+ *
+ * This function walks all memory areas mapped into a struct address_space.
+ * The walk is limited to only the given page-size index range, but if
+ * the index boundaries cross a huge page-table entry, that entry will be
+ * included.
+ *
+ * Also see walk_page_range() for additional information.
+ *
+ * Locking:
+ * This function can't require that the struct mm_struct::mmap_sem is held,
+ * since @mapping may be mapped by multiple processes. Instead
+ * @mapping->i_mmap_rwsem must be held. This might have implications in the
+ * callbacks, and it's up to the caller to ensure that the
+ * struct mm_struct::mmap_sem is not needed.
+ *
+ * Also this means that a caller can't rely on the struct
+ * vm_area_struct::vm_flags to be constant across a call,
+ * except for immutable flags. Callers requiring this shouldn't use
+ * this function.
+ *
+ * Return: 0 on success, negative error code on failure, positive number on
+ * caller-defined premature termination.
+ */
+int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
+ pgoff_t nr, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .private = private,
+ };
+ struct vm_area_struct *vma;
+ pgoff_t vba, vea, cba, cea;
+ unsigned long start_addr, end_addr;
+ int err = 0;
+
+ lockdep_assert_held(&mapping->i_mmap_rwsem);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+ first_index + nr - 1) {
+ /* Clip to the vma */
+ vba = vma->vm_pgoff;
+ vea = vba + vma_pages(vma);
+ cba = first_index;
+ cba = max(cba, vba);
+ cea = first_index + nr;
+ cea = min(cea, vea);
+
+ start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
+ end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
+ if (start_addr >= end_addr)
+ continue;
+
+ walk.vma = vma;
+ walk.mm = vma->vm_mm;
+
+ err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
+ if (err > 0) {
+ err = 0;
+ break;
+ } else if (err < 0)
+ break;
+
+ err = __walk_page_range(start_addr, end_addr, &walk);
+ if (err)
+ break;
+ }
+
+ return err;
+}
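A sketch of how a caller might drive walk_page_mapping() with the new pre_vma()/post_vma() hooks. The clean_* names and the dirty-counting body are illustrative only; the ops signatures follow the walker changes above:

struct clean_walk {
	unsigned long dirty;
};

static int clean_pre_vma(unsigned long start, unsigned long end,
			 struct mm_walk *walk)
{
	/* stage per-vma state; a nonzero return skips or aborts the walk */
	return 0;
}

static int clean_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct clean_walk *cw = walk->private;

	/* runs with the page-table lock the walker now takes */
	if (pte_dirty(*pte))
		cw->dirty++;
	return 0;
}

static void clean_post_vma(struct mm_walk *walk)
{
	/* commit-type work only: post_vma() cannot return an error */
}

static const struct mm_walk_ops clean_walk_ops = {
	.pre_vma = clean_pre_vma,
	.pte_entry = clean_pte_entry,
	.post_vma = clean_post_vma,
};

/* caller must hold mapping->i_mmap_rwsem, per the locking notes above */
static int count_dirty_ptes(struct address_space *mapping, pgoff_t first,
			    pgoff_t nr, struct clean_walk *cw)
{
	return walk_page_mapping(mapping, first, nr, &clean_walk_ops, cw);
}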
diff --git a/mm/rmap.c b/mm/rmap.c
index d9a23bb773bf..0c7b2a9400d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -61,6 +61,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index cd570cc79c76..220be9fa2c41 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3482,6 +3482,12 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
{
char *options = data;
+ if (options) {
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
+ if (err)
+ return err;
+ }
+
while (options != NULL) {
char *this_char = options;
for (;;) {
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 3ce12481b1dc..b3fe97fd6654 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -33,7 +33,7 @@ __meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
}
static bool shuffle_param;
-extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+static int shuffle_show(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
? 'Y' : 'N');
diff --git a/mm/slab.c b/mm/slab.c
index 9df370558e5d..66e5d8032bae 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4206,9 +4206,12 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
/**
* __ksize -- Uninstrumented ksize.
+ * @objp: pointer to the object
*
* Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
* safety checks as ksize() with KASAN instrumentation enabled.
+ *
+ * Return: size of the actual memory used by @objp in bytes
*/
size_t __ksize(const void *objp)
{
diff --git a/mm/slab.h b/mm/slab.h
index 68e455f2b698..b2b01694dc43 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -323,8 +323,8 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
* Expects a pointer to a slab page. Please note, that PageSlab() check
* isn't sufficient, as it returns true also for tail compound slab pages,
* which do not have slab_cache pointer set.
- * So this function assumes that the page can pass PageHead() and PageSlab()
- * checks.
+ * So this function assumes that the page can pass PageSlab() && !PageTail()
+ * check.
*
* The kmem_cache can be reparented asynchronously. The caller must ensure
* the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6491c3a41805..f9fb27b4c843 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -178,10 +178,13 @@ static int init_memcg_params(struct kmem_cache *s,
static void destroy_memcg_params(struct kmem_cache *s)
{
- if (is_root_cache(s))
+ if (is_root_cache(s)) {
kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
- else
+ } else {
+ mem_cgroup_put(s->memcg_params.memcg);
+ WRITE_ONCE(s->memcg_params.memcg, NULL);
percpu_ref_exit(&s->memcg_params.refcnt);
+ }
}
static void free_memcg_params(struct rcu_head *rcu)
@@ -253,8 +256,6 @@ static void memcg_unlink_cache(struct kmem_cache *s)
} else {
list_del(&s->memcg_params.children_node);
list_del(&s->memcg_params.kmem_caches_node);
- mem_cgroup_put(s->memcg_params.memcg);
- WRITE_ONCE(s->memcg_params.memcg, NULL);
}
}
#else
@@ -1030,10 +1031,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
unsigned int useroffset, unsigned int usersize)
{
int err;
+ unsigned int align = ARCH_KMALLOC_MINALIGN;
s->name = name;
s->size = s->object_size = size;
- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ /*
+ * For power of two sizes, guarantee natural alignment for kmalloc
+ * caches, regardless of SL*B debugging options.
+ */
+ if (is_power_of_2(size))
+ align = max(align, size);
+ s->align = calculate_alignment(flags, align, size);
+
s->useroffset = useroffset;
s->usersize = usersize;
@@ -1287,12 +1297,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
*/
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
- void *ret;
+ void *ret = NULL;
struct page *page;
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
- ret = page ? page_address(page) : NULL;
+ if (likely(page)) {
+ ret = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ret, size, 1, flags);
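The caller-visible effect of the create_boot_cache() alignment change above, as a sketch (the sizes are arbitrary examples):

static void kmalloc_alignment_demo(void)
{
	void *p512 = kmalloc(512, GFP_KERNEL);	/* power of two */
	void *p96 = kmalloc(96, GFP_KERNEL);	/* not a power of two */

	/* power-of-two sizes are now naturally aligned to the size */
	WARN_ON(p512 && ((unsigned long)p512 & 511));
	/* p96 is only guaranteed ARCH_KMALLOC_MINALIGN alignment */

	kfree(p512);
	kfree(p96);
}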
diff --git a/mm/slob.c b/mm/slob.c
index cf377beab962..fa53e9f73893 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
static void *slob_new_pages(gfp_t gfp, int order, int node)
{
- void *page;
+ struct page *page;
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
return page_address(page);
}
static void slob_free_pages(void *b, int order)
{
+ struct page *sp = virt_to_page(b);
+
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- free_pages((unsigned long)b, order);
+
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
}
/*
@@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order)
* @sp: Page to look in.
* @size: Size of the allocation.
* @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
* @page_removed_from_list: Return parameter.
*
* Tries to find a chunk of memory at least @size bytes big within @page.
@@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order)
* true (set to false otherwise).
*/
static void *slob_page_alloc(struct page *sp, size_t size, int align,
- bool *page_removed_from_list)
+ int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
@@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
+ /*
+ * 'aligned' will hold the address of the slob block so that the
+ * address 'aligned'+'align_offset' is aligned according to the
+ * 'align' parameter. This is for kmalloc() which prepends the
+ * allocated block with its size, so that the block itself is
+ * aligned when needed.
+ */
if (align) {
- aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ aligned = (slob_t *)
+ (ALIGN((unsigned long)cur + align_offset, align)
+ - align_offset);
delta = aligned - cur;
}
if (avail >= units + delta) { /* room enough? */
@@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
/*
* slob_alloc: entry point into the slob allocator.
*/
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+ int align_offset)
{
struct page *sp;
struct list_head *slob_list;
@@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- b = slob_page_alloc(sp, size, align, &page_removed_from_list);
+ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
if (!b)
continue;
@@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align, &_unused);
+ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
@@ -451,7 +469,7 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
fs_reclaim_acquire(gfp);
fs_reclaim_release(gfp);
- if (size < PAGE_SIZE - align) {
+ if (size < PAGE_SIZE - minalign) {
+ int align = minalign;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for
+ * kmalloc()'d objects.
+ */
+ if (is_power_of_2(size))
+ align = max(minalign, (int) size);
+
if (!size)
return ZERO_SIZE_PTR;
- m = slob_alloc(size + align, gfp, align, node);
+ m = slob_alloc(size + minalign, gfp, align, node, minalign);
if (!m)
return NULL;
*m = size;
- ret = (void *)m + align;
+ ret = (void *)m + minalign;
trace_kmalloc_node(caller, ret,
- size, size + align, gfp, node);
+ size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -521,8 +548,13 @@ void kfree(const void *block)
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
- } else
- __free_pages(sp, compound_order(sp));
+ } else {
+ unsigned int order = compound_order(sp);
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+		__free_pages(sp, order);
+	}
}
EXPORT_SYMBOL(kfree);
@@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
- b = slob_alloc(c->size, flags, c->align, node);
+ b = slob_alloc(c->size, flags, c->align, node, 0);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
diff --git a/mm/slub.c b/mm/slub.c
index 42c1b3af3c98..e72e802fc569 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1433,12 +1433,15 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *old_tail = *tail ? *tail : *head;
int rsize;
- if (slab_want_init_on_free(s)) {
- void *p = NULL;
+ /* Head and tail of the reconstructed freelist */
+ *head = NULL;
+ *tail = NULL;
- do {
- object = next;
- next = get_freepointer(s, object);
+ do {
+ object = next;
+ next = get_freepointer(s, object);
+
+ if (slab_want_init_on_free(s)) {
/*
* Clear the object and the metadata, but don't touch
* the redzone.
@@ -1448,29 +1451,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
: 0;
memset((char *)object + s->inuse, 0,
s->size - s->inuse - rsize);
- set_freepointer(s, object, p);
- p = object;
- } while (object != old_tail);
- }
-/*
- * Compiler cannot detect this function can be removed if slab_free_hook()
- * evaluates to nothing. Thus, catch all relevant config debug options here.
- */
-#if defined(CONFIG_LOCKDEP) || \
- defined(CONFIG_DEBUG_KMEMLEAK) || \
- defined(CONFIG_DEBUG_OBJECTS_FREE) || \
- defined(CONFIG_KASAN)
-
- next = *head;
-
- /* Head and tail of the reconstructed freelist */
- *head = NULL;
- *tail = NULL;
-
- do {
- object = next;
- next = get_freepointer(s, object);
+ }
/* If object's reuse doesn't have to be delayed */
if (!slab_free_hook(s, object)) {
/* Move object to the new freelist */
@@ -1485,9 +1467,6 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
*tail = NULL;
return *head != NULL;
-#else
- return true;
-#endif
}
static void *setup_object(struct kmem_cache *s, struct page *page,
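The restructured hunk above walks the detached freelist exactly once, optionally scrubbing each object and rebuilding a new head/tail chain from the objects whose reuse need not be delayed. A user-space analog of that single-pass rebuild (illustrative names, not the SLUB API; assumes a non-empty input list):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct obj {
	struct obj *next;
	int id;
};

/* Stand-in for slab_free_hook(): true means "delay reuse, drop from list". */
static bool must_delay(struct obj *o)
{
	return o->id % 2 != 0;
}

/* Rebuild the freelist in one pass, keeping only immediately reusable objects. */
static struct obj *rebuild(struct obj *list, struct obj **tail)
{
	struct obj *head = NULL, *next = list;

	*tail = NULL;
	do {
		struct obj *o = next;

		next = o->next;
		if (!must_delay(o)) {
			o->next = head;	/* push onto the new freelist */
			head = o;
			if (!*tail)	/* first survivor becomes the tail */
				*tail = o;
		}
	} while (next);
	return head;
}

int main(void)
{
	struct obj o3 = { NULL, 3 }, o2 = { &o3, 2 }, o1 = { &o2, 1 };
	struct obj *tail, *head = rebuild(&o1, &tail);

	for (struct obj *o = head; o; o = o->next)
		printf("reusable: %d\n", o->id);
	return 0;
}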
@@ -2672,6 +2651,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
}
/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out the freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
+}
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
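A compact illustration of what maybe_wipe_obj_freeptr() guards against: with init_on_free, the object is zeroed when freed, but the allocator then stores its freelist link inside the object, so the allocation path must re-zero that one word. The offset and names below are invented for the sketch:

#include <stdio.h>
#include <string.h>

#define OBJ_SIZE   64
#define FP_OFFSET  0	/* hypothetical freelist-pointer offset in the object */

int main(void)
{
	char obj[OBJ_SIZE];
	void *next = (void *)0xdeadbeef;	/* stale freelist link */
	void *leak;

	memset(obj, 0, sizeof(obj));			/* init_on_free wipe */
	memcpy(obj + FP_OFFSET, &next, sizeof(next));	/* allocator links object */

	/* maybe_wipe_obj_freeptr() equivalent: re-zero just the pointer word. */
	memset(obj + FP_OFFSET, 0, sizeof(void *));

	memcpy(&leak, obj + FP_OFFSET, sizeof(leak));
	printf("freelist word after wipe: %p\n", leak);	/* NULL again */
	return 0;
}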
@@ -2759,12 +2749,8 @@ redo:
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
- /*
- * If the object has been wiped upon free, make sure it's fully
- * initialized by zeroing out freelist pointer.
- */
- if (unlikely(slab_want_init_on_free(s)) && object)
- memset(object + s->offset, 0, sizeof(void *));
+
+ maybe_wipe_obj_freeptr(s, object);
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
@@ -3178,10 +3164,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
goto error;
c = this_cpu_ptr(s->cpu_slab);
+ maybe_wipe_obj_freeptr(s, p[i]);
+
continue; /* goto for-loop */
}
c->freelist = get_freepointer(s, object);
p[i] = object;
+ maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
local_irq_enable();
@@ -3821,11 +3810,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
void *ptr = NULL;
+ unsigned int order = get_order(size);
flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, get_order(size));
- if (page)
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
ptr = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
return kmalloc_large_node_hook(ptr, size, flags);
}
@@ -3951,9 +3944,13 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_pages(page, compound_order(page));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(page, order);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
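Together with the kmalloc_large_node() hunk above (and the matching slob.c change earlier), allocation and free now adjust NR_SLAB_UNRECLAIMABLE symmetrically for page-allocator-backed kmalloc buffers. A minimal sketch of that paired accounting, with a plain counter standing in for mod_node_page_state():

#include <stdio.h>
#include <stdlib.h>

static long nr_slab_unreclaimable_pages;	/* stand-in node stat */

static void *large_alloc(size_t pages)
{
	void *p = malloc(pages * 4096);

	if (p)
		nr_slab_unreclaimable_pages += pages;	/* +(1 << order) */
	return p;
}

static void large_free(void *p, size_t pages)
{
	nr_slab_unreclaimable_pages -= pages;		/* -(1 << order) */
	free(p);
}

int main(void)
{
	void *p = large_alloc(4);

	printf("accounted: %ld pages\n", nr_slab_unreclaimable_pages);
	large_free(p, 4);
	printf("accounted: %ld pages\n", nr_slab_unreclaimable_pages);
	return 0;
}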
@@ -4838,7 +4835,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held, as that would conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ *
+ * We don't really need mem_hotplug_lock (to hold off
+ * slab_mem_going_offline_callback) here because slab's memory hot
+ * unplug code doesn't destroy the kmem_cache->node[] data.
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4879,7 +4886,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
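The comment in the hunk above describes a classic ABBA inversion: the hotplug side takes mem_hotplug_lock and eventually kernfs_mutex, while sysfs readers already hold kernfs_mutex when they reach this code. A hedged pthread sketch of the consistent ordering after the fix (lock names are illustrative only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mem_hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t kernfs_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Hotplug side: mem_hotplug_lock -> ... -> kernfs_mutex. */
static void hotplug_path(void)
{
	pthread_mutex_lock(&mem_hotplug_lock);
	pthread_mutex_lock(&kernfs_mutex);
	pthread_mutex_unlock(&kernfs_mutex);
	pthread_mutex_unlock(&mem_hotplug_lock);
}

/*
 * sysfs reader after the fix: kernfs_mutex is already held, and we no
 * longer try to take mem_hotplug_lock under it, so the reverse order
 * (kernfs_mutex -> mem_hotplug_lock) can no longer form an ABBA pair.
 */
static void sysfs_show_path(void)
{
	pthread_mutex_lock(&kernfs_mutex);
	/* read per-node counters without mem_hotplug_lock */
	pthread_mutex_unlock(&kernfs_mutex);
}

int main(void)
{
	hotplug_path();
	sysfs_show_path();
	puts("consistent lock order, no ABBA");
	return 0;
}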
diff --git a/mm/sparse.c b/mm/sparse.c
index bf32de9e666b..f6891c1992b1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -219,7 +219,7 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
-void subsection_mask_set(unsigned long *map, unsigned long pfn,
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
unsigned long nr_pages)
{
int idx = subsection_map_index(pfn);
diff --git a/mm/truncate.c b/mm/truncate.c
index 8563339041f6..dd9ebc1da356 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -592,6 +592,16 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
unlock_page(page);
continue;
}
+
+ /* Take a pin outside pagevec */
+ get_page(page);
+
+ /*
+ * Drop extra pins before trying to invalidate
+ * the huge page.
+ */
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
}
ret = invalidate_inode_page(page);
@@ -602,6 +612,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
*/
if (!ret)
deactivate_file_page(page);
+ if (PageTransHuge(page))
+ put_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
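The truncate.c change takes its own reference on the page before releasing the batch's references, so the extra pins held by the pagevec cannot stop invalidate_inode_page() from seeing an otherwise-idle huge page. In miniature, on a hypothetical refcounted object rather than the real page API:

#include <stdio.h>

struct ref_obj {
	int refcount;
};

static void get_ref(struct ref_obj *o) { o->refcount++; }
static void put_ref(struct ref_obj *o) { o->refcount--; }

int main(void)
{
	struct ref_obj page = { .refcount = 2 };	/* mapping + pagevec */

	get_ref(&page);		/* our private pin, taken outside the batch */
	put_ref(&page);		/* pagevec_release(): drop the batch's pin */

	/* Only the mapping's reference and ours remain. */
	printf("refcount before invalidate: %d\n", page.refcount);
	put_ref(&page);		/* drop our pin once invalidation is done */
	return 0;
}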
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3c70e275f4e..4a7d7459c4f9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2672,6 +2672,26 @@ void *vzalloc_node(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node);
/**
+ * vmalloc_user_node_flags - allocate memory for userspace on a specific node
+ * @size: allocation size
+ * @node: numa node
+ * @flags: flags for the page level allocator
+ *
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
+{
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ flags | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, node,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_user_node_flags);
+
+/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
*
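A hedged usage sketch for the new vmalloc_user_node_flags() helper, assuming kernel-module context; the caller and GFP mask here are examples, not taken from this patch. The buffer is freed with vfree() as usual:

#include <linux/vmalloc.h>

/*
 * Hypothetical caller: allocate a zeroed buffer that can later be mapped
 * into userspace (VM_USERMAP is set), preferring pages near @node.
 */
static void *alloc_user_buf(unsigned long size, int node)
{
	return vmalloc_user_node_flags(size, node, GFP_KERNEL);
}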
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index f3b50811497a..4bac22fe1aa2 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -355,6 +355,9 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* "hierarchy" or "local").
*
* To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure, or -EINVAL if @args could
+ * not be parsed.
*/
int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
@@ -362,7 +365,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
- enum vmpressure_levels level = -1;
+ enum vmpressure_levels level;
char *spec, *spec_orig;
char *token;
int ret = 0;
@@ -375,20 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
/* Find required level */
token = strsep(&spec, ",");
- level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
- if (level < 0) {
- ret = level;
+ ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (ret < 0)
goto out;
- }
+ level = ret;
/* Find optional mode */
token = strsep(&spec, ",");
if (token) {
- mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
- if (mode < 0) {
- ret = mode;
+ ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (ret < 0)
goto out;
- }
+ mode = ret;
}
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
@@ -404,6 +405,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
+ ret = 0;
out:
kfree(spec_orig);
return ret;
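The fix above follows the match_string() convention of returning either a valid array index or a negative errno; keeping the raw return value in ret avoids ever stuffing a negative value into the enum. A user-space analog of the parse step, with a local helper standing in for the kernel's match_string():

#include <stdio.h>
#include <string.h>

static const char * const levels[] = { "low", "medium", "critical" };

/* Return the matching index, or -1 if @tok names no known level. */
static int match_string_local(const char * const *arr, int n, const char *tok)
{
	for (int i = 0; i < n; i++)
		if (!strcmp(arr[i], tok))
			return i;
	return -1;
}

int main(void)
{
	char spec[] = "medium,hierarchy";
	char *s = spec;
	char *token = strsep(&s, ",");
	int ret = match_string_local(levels, 3, token);

	if (ret < 0)
		return 1;	/* propagate the error; level stays unset */
	printf("level index: %d, rest: %s\n", ret, s ? s : "(none)");
	return 0;
}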
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5d52d6a24af..ee4eecc7e1c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -351,12 +351,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
*/
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
- unsigned long lru_size;
+ unsigned long lru_size = 0;
int zid;
- if (!mem_cgroup_disabled())
- lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
- else
+ if (!mem_cgroup_disabled()) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ } else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
@@ -932,10 +933,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
- if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
- refcount = 1 + HPAGE_PMD_NR;
- else
- refcount = 2;
+ refcount = 1 + compound_nr(page);
if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
@@ -2459,17 +2457,70 @@ out:
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in having to
+ * set extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2489,7 +2540,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2498,7 +2549,7 @@ out:
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
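Worked example of the proportional scaling introduced above, with illustrative numbers: a cgroup using 200 MiB under a 150 MiB memory.low protection has 25% of its usage unprotected, so scan = lruvec_size - lruvec_size * protection / cgroup_size reduces the pre-priority scan target to 25% of the LRU. As a check:

#include <stdio.h>

int main(void)
{
	unsigned long lruvec_size = 100000;	/* pages on this LRU */
	unsigned long protection = 150 << 8;	/* 150 MiB in 4 KiB pages */
	unsigned long cgroup_size = 200 << 8;	/* 200 MiB in 4 KiB pages */

	/* Avoid TOCTOU: usage must never look smaller than its protection. */
	if (cgroup_size < protection)
		cgroup_size = protection;

	unsigned long scan = lruvec_size -
			     lruvec_size * protection / cgroup_size;

	printf("scan target: %lu of %lu pages\n", scan, lruvec_size); /* 25000 */
	return 0;
}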
@@ -2742,6 +2793,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6afc892a148a..a8222041bd44 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1383,12 +1383,29 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
unsigned long freecount = 0;
struct free_area *area;
struct list_head *curr;
+ bool overflow = false;
area = &(zone->free_area[order]);
- list_for_each(curr, &area->free_list[mtype])
- freecount++;
- seq_printf(m, "%6lu ", freecount);
+ list_for_each(curr, &area->free_list[mtype]) {
+ /*
+ * Cap the free_list iteration because it might
+ * be really large and we are under a spinlock,
+ * so a long time spent here could trigger the
+ * hard lockup detector. Anyway, this is a
+ * debugging tool so knowing there is a handful
+ * of pages of this order should be more than
+ * sufficient.
+ */
+ if (++freecount >= 100000) {
+ overflow = true;
+ break;
+ }
+ }
+ seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
+ spin_unlock_irq(&zone->lock);
+ cond_resched();
+ spin_lock_irq(&zone->lock);
}
seq_putc(m, '\n');
}
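The hunk above bounds a potentially unbounded walk under a spinlock, and briefly drops the lock between migrate types to let other CPUs in. A small sketch of the capped-count idiom itself (user-space, with the lock and cond_resched() elided):

#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; };

/* Count list entries, reporting ">cap" instead of walking forever. */
static unsigned long capped_count(struct node *head, unsigned long cap,
				  bool *overflow)
{
	unsigned long n = 0;

	*overflow = false;
	for (struct node *cur = head; cur; cur = cur->next) {
		if (++n >= cap) {
			*overflow = true;
			break;
		}
	}
	return n;
}

int main(void)
{
	struct node c = { NULL }, b = { &c }, a = { &b };
	bool overflow;
	unsigned long n = capped_count(&a, 100000, &overflow);

	printf("%s%lu\n", overflow ? ">" : "", n);
	return 0;
}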
@@ -1972,7 +1989,7 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
- proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 05bdf90646e7..6d3d3f698ebb 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
struct z3fold_header *zhdr;
struct page *page;
enum buddy bud;
+ bool page_claimed;
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
if (test_bit(PAGE_HEADLESS, &page->private)) {
/* if a headless page is under reclaim, just leave.
@@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
* has not been set before, we release this page
* immediately so we don't care about its value any more.
*/
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (!page_claimed) {
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
@@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(PAGE_CLAIMED, &page->private)) {
+ if (page_claimed) {
+ /* the page was already claimed by someone else */
z3fold_page_unlock(zhdr);
return;
}
if (unlikely(PageIsolated(page)) ||
test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
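The reworked free path claims the page up front: whoever wins the test_and_set_bit() race owns the page, and every early-return path the winner takes must clear the bit again. A compact C11-atomics sketch of that claim/release discipline (illustrative, not the z3fold API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag page_claimed = ATOMIC_FLAG_INIT;

/* Returns true if we won ownership of the page. */
static bool claim_page(void)
{
	return !atomic_flag_test_and_set(&page_claimed);
}

static void release_claim(void)
{
	atomic_flag_clear(&page_claimed);
}

int main(void)
{
	if (!claim_page()) {
		puts("someone else is freeing/reclaiming this page");
		return 0;
	}
	/* ... do the actual free work; then, on every exit path: ... */
	release_claim();
	puts("page freed, claim dropped");
	return 0;
}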