summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug21
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/debug.c16
-rw-r--r--mm/early_ioremap.c8
-rw-r--r--mm/filemap.c34
-rw-r--r--mm/gup.c503
-rw-r--r--mm/gup_benchmark.c9
-rw-r--r--mm/hmm.c62
-rw-r--r--mm/huge_memory.c55
-rw-r--r--mm/kasan/common.c2
-rw-r--r--mm/kmemleak.c112
-rw-r--r--mm/memblock.c22
-rw-r--r--mm/memcontrol.c25
-rw-r--r--mm/memory.c44
-rw-r--r--mm/memory_hotplug.c128
-rw-r--r--mm/mempolicy.c6
-rw-r--r--mm/memremap.c77
-rw-r--r--mm/migrate.c82
-rw-r--r--mm/mincore.c1
-rw-r--r--mm/mmap.c36
-rw-r--r--mm/mmu_gather.c134
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c126
-rw-r--r--mm/page_io.c11
-rw-r--r--mm/page_isolation.c53
-rw-r--r--mm/page_vma_mapped.c12
-rw-r--r--mm/pagewalk.c163
-rw-r--r--mm/process_vm_access.c28
-rw-r--r--mm/ptdump.c139
-rw-r--r--mm/shmem.c40
-rw-r--r--mm/slab_common.c37
-rw-r--r--mm/slub.c88
-rw-r--r--mm/sparse.c12
-rw-r--r--mm/swap.c27
-rw-r--r--mm/swapfile.c16
-rw-r--r--mm/vmscan.c24
-rw-r--r--mm/zswap.c86
38 files changed, 1351 insertions, 893 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 327b3ebf23bf..0271b22e063f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -117,3 +117,24 @@ config DEBUG_RODATA_TEST
depends on STRICT_KERNEL_RWX
---help---
This option enables a testcase for the setting rodata read-only.
+
+config GENERIC_PTDUMP
+ bool
+
+config PTDUMP_CORE
+ bool
+
+config PTDUMP_DEBUGFS
+ bool "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on GENERIC_PTDUMP
+ select PTDUMP_CORE
+ help
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
diff --git a/mm/Makefile b/mm/Makefile
index 1937cc251883..272e66039e70 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -20,6 +20,7 @@ KCOV_INSTRUMENT_kmemleak.o := n
KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+KCOV_INSTRUMENT_failslab.o := n
CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
@@ -108,3 +109,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c360f6a6c844..62f05f605fb5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -21,6 +21,7 @@ struct backing_dev_info noop_backing_dev_info = {
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
+const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
diff --git a/mm/debug.c b/mm/debug.c
index 74ee73cf7079..ecccd9f17801 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -46,7 +46,15 @@ void __dump_page(struct page *page, const char *reason)
{
struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
+ /*
+ * Accessing the pageblock without the zone lock. It could change to
+ * "isolate" again in the meantime, but since we are just dumping the
+ * state for debugging, it should be fine to accept a bit of
+ * inaccuracy here due to racing.
+ */
+ bool page_cma = is_migrate_cma_page(page);
int mapcount;
+ char *type = "";
/*
* If struct page is poisoned don't access Page*() functions as that
@@ -78,9 +86,9 @@ void __dump_page(struct page *page, const char *reason)
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageKsm(page))
- pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ type = "ksm ";
else if (PageAnon(page))
- pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ type = "anon ";
else if (mapping) {
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
@@ -88,10 +96,12 @@ void __dump_page(struct page *page, const char *reason)
pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
} else
pr_warn("%ps\n", mapping->a_ops);
- pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+ pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
+ page_cma ? " CMA" : "");
+
hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 1826f191e72c..a0018ad1a1f6 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -121,8 +121,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
}
}
- if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
- __func__, (u64)phys_addr, size))
+ if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n",
+ __func__, &phys_addr, size))
return NULL;
/* Don't allow wraparound or zero size */
@@ -158,8 +158,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
--idx;
--nrpages;
}
- WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
- __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
+ WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, &phys_addr, size, slot, offset, slot_virt[slot]);
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
diff --git a/mm/filemap.c b/mm/filemap.c
index bf6aa30be58d..1784478270e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -632,33 +632,6 @@ static bool mapping_needs_writeback(struct address_space *mapping)
return mapping->nrpages;
}
-int filemap_write_and_wait(struct address_space *mapping)
-{
- int err = 0;
-
- if (mapping_needs_writeback(mapping)) {
- err = filemap_fdatawrite(mapping);
- /*
- * Even if the above returned error, the pages may be
- * written partially (e.g. -ENOSPC), so we wait for it.
- * But the -EIO is special case, it may indicate the worst
- * thing (e.g. bug) happened, so we avoid waiting for it.
- */
- if (err != -EIO) {
- int err2 = filemap_fdatawait(mapping);
- if (!err)
- err = err2;
- } else {
- /* Clear any previously stored errors */
- filemap_check_errors(mapping);
- }
- } else {
- err = filemap_check_errors(mapping);
- }
- return err;
-}
-EXPORT_SYMBOL(filemap_write_and_wait);
-
/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
@@ -680,7 +653,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
- /* See comment of filemap_write_and_wait() */
+ /*
+ * Even if the above returned error, the pages may be
+ * written partially (e.g. -ENOSPC), so we wait for it.
+ * But the -EIO is special case, it may indicate the worst
+ * thing (e.g. bug) happened, so we avoid waiting for it.
+ */
if (err != -EIO) {
int err2 = filemap_fdatawait_range(mapping,
lstart, lend);
diff --git a/mm/gup.c b/mm/gup.c
index 7646bf993b25..1b521e0ac1de 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,8 +29,23 @@ struct follow_page_context {
unsigned int page_mask;
};
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
/**
- * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
* @make_dirty: whether to mark the pages dirty
@@ -40,19 +55,19 @@ struct follow_page_context {
*
* For each page in the @pages array, make that page (or its head page, if a
* compound page) dirty, if @make_dirty is true, and if the page was previously
- * listed as clean. In any case, releases all pages using put_user_page(),
- * possibly via put_user_pages(), for the non-dirty case.
+ * listed as clean. In any case, releases all pages using unpin_user_page(),
+ * possibly via unpin_user_pages(), for the non-dirty case.
*
- * Please see the put_user_page() documentation for details.
+ * Please see the unpin_user_page() documentation for details.
*
* set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
* required, then the caller should a) verify that this is really correct,
* because _lock() is usually required, and b) hand code it:
- * set_page_dirty_lock(), put_user_page().
+ * set_page_dirty_lock(), unpin_user_page().
*
*/
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
- bool make_dirty)
+void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
{
unsigned long index;
@@ -63,7 +78,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
*/
if (!make_dirty) {
- put_user_pages(pages, npages);
+ unpin_user_pages(pages, npages);
return;
}
@@ -91,21 +106,21 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
*/
if (!PageDirty(page))
set_page_dirty_lock(page);
- put_user_page(page);
+ unpin_user_page(page);
}
}
-EXPORT_SYMBOL(put_user_pages_dirty_lock);
+EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
- * put_user_pages() - release an array of gup-pinned pages.
+ * unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
*
- * For each page in the @pages array, release the page using put_user_page().
+ * For each page in the @pages array, release the page using unpin_user_page().
*
- * Please see the put_user_page() documentation for details.
+ * Please see the unpin_user_page() documentation for details.
*/
-void put_user_pages(struct page **pages, unsigned long npages)
+void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
@@ -115,9 +130,9 @@ void put_user_pages(struct page **pages, unsigned long npages)
* single operation to the head page should suffice.
*/
for (index = 0; index < npages; index++)
- put_user_page(pages[index]);
+ unpin_user_page(pages[index]);
}
-EXPORT_SYMBOL(put_user_pages);
+EXPORT_SYMBOL(unpin_user_pages);
#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
@@ -179,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t *ptep, pte;
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return ERR_PTR(-EINVAL);
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -323,7 +342,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
+ if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
@@ -433,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pud(mm, address, pud, flags);
if (page)
return page;
@@ -796,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
start = untagged_addr(start);
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+ VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
@@ -1020,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
BUG_ON(*locked != 1);
}
- if (pages)
+ /*
+ * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
+ * is to set FOLL_GET if the caller wants pages[] filled in (but has
+ * carelessly failed to specify FOLL_GET), so keep doing that, but only
+ * for FOLL_GET, not for the newer FOLL_PIN.
+ *
+ * FOLL_PIN always expects pages to be non-null, but no need to assert
+ * that here, as any failures will be obvious enough.
+ */
+ if (pages && !(flags & FOLL_PIN))
flags |= FOLL_GET;
pages_done = 0;
@@ -1096,88 +1124,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
return pages_done;
}
-/*
- * get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
- * Returns either number of pages pinned (which may be less than the
- * number requested), or an error. Details about the return value:
- *
- * -- If nr_pages is 0, returns 0.
- * -- If nr_pages is >0, but no pages were pinned, returns -errno.
- * -- If nr_pages is >0, and some pages were pinned, returns the number of
- * pages pinned. Again, this may be less than nr_pages.
- *
- * The caller is responsible for releasing returned @pages, via put_page().
- *
- * @vmas are valid only as long as mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
- * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
- * be called after the page is finished with, and before put_page is called.
- *
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
- *
- * See also get_user_pages_fast, for performance critical applications.
- *
- * get_user_pages should be phased out in favor of
- * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
- * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
- */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
-}
-EXPORT_SYMBOL(get_user_pages_remote);
-
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
@@ -1612,6 +1558,116 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
/*
+ * get_user_pages_remote() - pin user pages in memory
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
+ * be called after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
+ */
+#ifdef CONFIG_MMU
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ /*
+ * Parts of FOLL_LONGTERM behavior are incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. However, this only comes up if locked is set, and there are
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
+ * allow what we can.
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ if (WARN_ON_ONCE(locked))
+ return -EINVAL;
+ /*
+ * This will check the vmas (even if our vmas arg is NULL)
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
+ */
+ return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ vmas, gup_flags | FOLL_TOUCH |
+ FOLL_REMOTE);
+ }
+
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+#else /* CONFIG_MMU */
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ return 0;
+}
+#endif /* !CONFIG_MMU */
+
+/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
* and mm being operated on are the current task's and don't allow
@@ -1622,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
return __gup_longterm_locked(current, current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
@@ -1729,7 +1792,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
* free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
@@ -1807,20 +1870,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
}
-/*
- * Return the compund head page with ref appropriately incremented,
- * or NULL if that failed.
- */
-static inline struct page *try_get_compound_head(struct page *page, int refs)
-{
- struct page *head = compound_head(page);
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
- return NULL;
- if (unlikely(!page_cache_add_speculative(head, refs)))
- return NULL;
- return head;
-}
-
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
@@ -1978,6 +2027,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif
+static int record_subpages(struct page *page, unsigned long addr,
+ unsigned long end, struct page **pages)
+{
+ int nr;
+
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
+ pages[nr++] = page++;
+
+ return nr;
+}
+
+static void put_compound_head(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -2007,32 +2079,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- refs = 0;
head = pte_page(pte);
-
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(head, refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2079,28 +2139,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
}
- refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pmd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2120,28 +2171,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
}
- refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pud_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2157,28 +2199,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
- refs = 0;
+
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pgd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2237,7 +2271,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
pud_t pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
- if (pud_none(pud))
+ if (unlikely(!pud_present(pud)))
return 0;
if (unlikely(pud_huge(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
@@ -2393,29 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
return ret;
}
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
+static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags,
+ struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
- if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE | FOLL_PIN)))
return -EINVAL;
start = untagged_addr(start) & PAGE_MASK;
@@ -2455,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number requested.
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
+ * -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/**
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages_fast().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+int pin_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+
+/**
+ * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages_remote().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
+ vmas, locked);
+}
+EXPORT_SYMBOL(pin_user_pages_remote);
+
+/**
+ * pin_user_pages() - pin user pages in memory for use by other devices
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+EXPORT_SYMBOL(pin_user_pages);
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index ad9d5b1c4473..8dba38e79a9f 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -49,18 +49,21 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
+ /* Filter out most gup flags: only allow a tiny subset here: */
+ gup->flags &= FOLL_WRITE;
+
switch (cmd) {
case GUP_FAST_BENCHMARK:
- nr = get_user_pages_fast(addr, nr, gup->flags & 1,
+ nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
nr = get_user_pages(addr, nr,
- (gup->flags & 1) | FOLL_LONGTERM,
+ gup->flags | FOLL_LONGTERM,
pages + i, NULL);
break;
case GUP_BENCHMARK:
- nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
+ nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
default:
diff --git a/mm/hmm.c b/mm/hmm.c
index d379cb6496ae..72e5a6d9a417 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -186,7 +186,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
}
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
again:
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
- return hmm_vma_walk_hole(start, end, walk);
+ return hmm_vma_walk_hole(start, end, -1, walk);
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
bool fault, write_fault;
@@ -474,23 +474,32 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- unsigned long addr = start, next;
- pmd_t *pmdp;
+ unsigned long addr = start;
pud_t pud;
- int ret;
+ int ret = 0;
+ spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
+
+ if (!ptl)
+ return 0;
+
+ /* Normally we don't want to split the huge page */
+ walk->action = ACTION_CONTINUE;
-again:
pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return hmm_vma_walk_hole(start, end, walk);
+ if (pud_none(pud)) {
+ ret = hmm_vma_walk_hole(start, end, -1, walk);
+ goto out_unlock;
+ }
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
uint64_t *pfns, cpu_flags;
bool fault, write_fault;
- if (!pud_present(pud))
- return hmm_vma_walk_hole(start, end, walk);
+ if (!pud_present(pud)) {
+ ret = hmm_vma_walk_hole(start, end, -1, walk);
+ goto out_unlock;
+ }
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
@@ -499,16 +508,20 @@ again:
cpu_flags = pud_to_hmm_pfn_flags(range, pud);
hmm_range_need_fault(hmm_vma_walk, pfns, npages,
cpu_flags, &fault, &write_fault);
- if (fault || write_fault)
- return hmm_vma_walk_hole_(addr, end, fault,
- write_fault, walk);
+ if (fault || write_fault) {
+ ret = hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+ goto out_unlock;
+ }
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn) {
hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
- return -EBUSY;
+ if (unlikely(!hmm_vma_walk->pgmap)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
}
@@ -517,22 +530,15 @@ again:
hmm_vma_walk->pgmap = NULL;
}
hmm_vma_walk->last = end;
- return 0;
+ goto out_unlock;
}
- split_huge_pud(walk->vma, pudp, addr);
- if (pud_none(*pudp))
- goto again;
+ /* Ask for the PUD to be split */
+ walk->action = ACTION_SUBTREE;
- pmdp = pmd_offset(pudp, addr);
- do {
- next = pmd_addr_end(addr, end);
- ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
- if (ret)
- return ret;
- } while (pmdp++, addr = next, addr != end);
-
- return 0;
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hmm_vma_walk_pud NULL
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a88093213674..b08b199f9a11 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -177,16 +177,13 @@ static ssize_t enabled_store(struct kobject *kobj,
{
ssize_t ret = count;
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else
@@ -250,32 +247,27 @@ static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer+madvise", buf,
- min(sizeof("defer+madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "defer+madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
+ } else if (sysfs_streq(buf, "defer")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
@@ -527,6 +519,17 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
+bool is_transparent_hugepage(struct page *page)
+{
+ if (!PageCompound(page))
+ return 0;
+
+ page = compound_head(page);
+ return is_huge_zero_page(page) ||
+ page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(is_transparent_hugepage);
+
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
@@ -2704,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2712,11 +2715,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
unsigned long flags;
pgoff_t end;
- VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
+ VM_BUG_ON_PAGE(!PageLocked(head), head);
+ VM_BUG_ON_PAGE(!PageCompound(head), head);
- if (PageWriteback(page))
+ if (PageWriteback(head))
return -EBUSY;
if (PageAnon(head)) {
@@ -2767,7 +2770,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- mlocked = PageMlocked(page);
+ mlocked = PageMlocked(head);
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
@@ -2799,14 +2802,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
+ spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- if (PageSwapBacked(page))
- __dec_node_page_state(page, NR_SHMEM_THPS);
+ if (PageSwapBacked(head))
+ __dec_node_page_state(head, NR_SHMEM_THPS);
else
- __dec_node_page_state(page, NR_FILE_THPS);
+ __dec_node_page_state(head, NR_FILE_THPS);
}
- spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index c15d8ae68c96..6aa51723b92b 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -110,6 +110,7 @@ void *memset(void *addr, int c, size_t len)
return __memset(addr, c, len);
}
+#ifdef __HAVE_ARCH_MEMMOVE
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
@@ -118,6 +119,7 @@ void *memmove(void *dest, const void *src, size_t len)
return __memmove(dest, src, len);
}
+#endif
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 244607663363..3a4259eeb5a0 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -13,7 +13,7 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
@@ -22,13 +22,13 @@
* object_tree_root in the create_object() function called from the
* kmemleak_alloc() callback and removed in delete_object() called from the
* kmemleak_free() callback
- * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
- * the metadata (e.g. count) are protected by this lock. Note that some
- * members of this structure may be protected by other means (atomic or
- * kmemleak_lock). This lock is also held when scanning the corresponding
- * memory block to avoid the kernel freeing it via the kmemleak_free()
- * callback. This is less heavyweight than holding a global lock like
- * kmemleak_lock during scanning
+ * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
+ * Accesses to the metadata (e.g. count) are protected by this lock. Note
+ * that some members of this structure may be protected by other means
+ * (atomic or kmemleak_lock). This lock is also held when scanning the
+ * corresponding memory block to avoid the kernel freeing it via the
+ * kmemleak_free() callback. This is less heavyweight than holding a global
+ * lock like kmemleak_lock during scanning.
* - scan_mutex (mutex): ensures that only one thread may scan the memory for
* unreferenced objects at a time. The gray_list contains the objects which
* are already referenced or marked as false positives and need to be
@@ -135,7 +135,7 @@ struct kmemleak_scan_area {
* (use_count) and freed using the RCU mechanism.
*/
struct kmemleak_object {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
@@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
-/* rw_lock protecting the access to object_list and object_tree_root */
-static DEFINE_RWLOCK(kmemleak_lock);
+/* protecting the access to object_list and object_tree_root */
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
@@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
}
/* slab allocation failed, try the memory pool */
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = list_first_entry_or_null(&mem_pool_free_list,
typeof(*object), object_list);
if (object)
@@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
object = &mem_pool[--mem_pool_free_count];
else
pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemleak_object *object)
}
/* add the object to the memory pool free list */
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
list_add(&object->object_list, &mem_pool_free_list);
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
struct kmemleak_object *object;
rcu_read_lock();
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
@@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
unsigned long flags;
struct kmemleak_object *object;
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object)
__remove_object(object);
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -585,7 +585,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
- spin_lock_init(&object->lock);
+ raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
@@ -617,7 +617,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
min_addr = min(min_addr, untagged_ptr);
@@ -649,7 +649,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
list_add_tail_rcu(&object->object_list, &object_list);
out:
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -667,9 +667,9 @@ static void __delete_object(struct kmemleak_object *object)
* Locking here also ensures that the corresponding memory block
* cannot be freed when it is being scanned.
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags &= ~OBJECT_ALLOCATED;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_object *object, int color)
{
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
__paint_it(object, color);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
static void paint_ptr(unsigned long ptr, int color)
@@ -798,7 +798,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (!area) {
pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
/* mark the object for full scan to avoid false positives */
@@ -820,7 +820,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
hlist_add_head(&area->node, &object->area_list);
out_unlock:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -842,9 +842,9 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->excess_ref = excess_ref;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -864,9 +864,9 @@ static void object_no_scan(unsigned long ptr)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags |= OBJECT_NO_SCAN;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const void *ptr)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->trace_len = __save_stack_trace(object->trace);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1233,7 +1233,7 @@ static void scan_block(void *_start, void *_end,
unsigned long flags;
unsigned long untagged_ptr;
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
@@ -1268,7 +1268,7 @@ static void scan_block(void *_start, void *_end,
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
/* only pass surplus references (object already gray) */
if (color_gray(object)) {
excess_ref = object->excess_ref;
@@ -1277,7 +1277,7 @@ static void scan_block(void *_start, void *_end,
excess_ref = 0;
update_refs(object);
}
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
if (excess_ref) {
object = lookup_object(excess_ref, 0);
@@ -1286,12 +1286,12 @@ static void scan_block(void *_start, void *_end,
if (object == scanned)
/* circular reference, ignore */
continue;
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
update_refs(object);
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
}
}
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_object *object)
* Once the object->lock is acquired, the corresponding memory block
* cannot be freed (the same lock is acquired in delete_object).
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
goto out;
if (!(object->flags & OBJECT_ALLOCATED))
@@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_object *object)
if (start >= end)
break;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
@@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_object *object)
(void *)(area->start + area->size),
object);
out:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
/*
@@ -1407,7 +1407,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1424,7 +1424,7 @@ static void kmemleak_scan(void)
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1492,14 +1492,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1519,7 +1519,7 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1529,7 +1529,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
struct kmemleak_object *object = v;
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
return 0;
}
@@ -1714,9 +1714,9 @@ static int dump_str_object_info(const char *str)
return -EINVAL;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
return 0;
@@ -1735,11 +1735,11 @@ static void kmemleak_clear(void)
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
diff --git a/mm/memblock.c b/mm/memblock.c
index 4bc2c7d8bf42..eba94ee3de0b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -575,7 +575,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* Return:
* 0 on success, -errno on failure.
*/
-int __init_memblock memblock_add_range(struct memblock_type *type,
+static int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, enum memblock_flags flags)
{
@@ -694,7 +694,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -795,7 +795,7 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_remove: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_remove_range(&memblock.memory, base, size);
@@ -813,7 +813,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
@@ -824,12 +824,24 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
+}
+#endif
+
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @base: base address of the region
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c83cf4ed970..6f6dc8712e39 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5340,14 +5340,6 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (compound && !list_empty(page_deferred_list(page))) {
- spin_lock(&from->deferred_split_queue.split_queue_lock);
- list_del_init(page_deferred_list(page));
- from->deferred_split_queue.split_queue_len--;
- spin_unlock(&from->deferred_split_queue.split_queue_lock);
- }
-#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -5357,16 +5349,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
page->mem_cgroup = to;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (compound && list_empty(page_deferred_list(page))) {
- spin_lock(&to->deferred_split_queue.split_queue_lock);
- list_add_tail(page_deferred_list(page),
- &to->deferred_split_queue.split_queue);
- to->deferred_split_queue.split_queue_len++;
- spin_unlock(&to->deferred_split_queue.split_queue_lock);
- }
-#endif
-
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -6651,7 +6633,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
- bool compound;
unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6673,8 +6654,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Force-charge the new page. The old one will be freed soon */
- compound = PageTransHuge(newpage);
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+ nr_pages = hpage_nr_pages(newpage);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
@@ -6684,7 +6664,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
+ nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}
diff --git a/mm/memory.c b/mm/memory.c
index 1c4be871a237..0bccc622e482 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1664,6 +1664,9 @@ out_unlock:
* vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
*
+ * See vmf_insert_mixed_prot() for a discussion of the implication of using
+ * a value of @pgprot different from that of @vma->vm_page_prot.
+ *
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
@@ -1737,9 +1740,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+ unsigned long addr, pfn_t pfn, pgprot_t pgprot,
+ bool mkwrite)
{
- pgprot_t pgprot = vma->vm_page_prot;
int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -1782,10 +1785,43 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+/**
+ * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers to
+ * to override pgprot on a per-page basis.
+ *
+ * Typically this function should be used by drivers to set caching- and
+ * encryption bits different than those of @vma->vm_page_prot, because
+ * the caching- or encryption mode may not be known at mmap() time.
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, pgprot_t pgprot)
+{
+ return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
+}
+EXPORT_SYMBOL(vmf_insert_mixed_prot);
+
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, false);
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
@@ -1797,7 +1833,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, true);
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a91a072f2b2c..0a54ffac8c68 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -355,7 +355,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
if (unlikely(pfn_to_nid(start_pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+ if (zone != page_zone(pfn_to_page(start_pfn)))
continue;
return start_pfn;
@@ -380,7 +380,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
if (unlikely(pfn_to_nid(pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(pfn)))
+ if (zone != page_zone(pfn_to_page(pfn)))
continue;
return pfn;
@@ -392,14 +392,11 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
unsigned long pfn;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
+ if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -407,50 +404,30 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
+ zone_end_pfn(zone));
if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
}
- } else if (zone_end_pfn == end_pfn) {
+ } else if (zone_end_pfn(zone) == end_pfn) {
/*
* If the section is biggest section in the zone, it need
* shrink zone->spanned_pages.
* In this case, we find second biggest valid mem_section for
* shrinking zone.
*/
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
- }
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_to_online_page(pfn)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
}
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
@@ -490,6 +467,9 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+ /* Poison struct pages because they are now uninitialized again. */
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+
#ifdef CONFIG_ZONE_DEVICE
/*
* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
@@ -536,25 +516,20 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages,
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
unsigned long map_offset = 0;
- unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- __remove_section(pfn, pfns, map_offset, altmap);
- pfn += pfns;
- nr_pages -= pfns;
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
}
@@ -783,27 +758,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ int online_type, int nid)
{
unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
- int nid;
int ret;
struct memory_notify arg;
- struct memory_block *mem;
mem_hotplug_begin();
- /*
- * We can't use pfn_to_nid() because nid might be stored in struct page
- * which is not yet initialized. Instead, we find nid from memory block.
- */
- mem = find_memory_block(__pfn_to_section(pfn));
- nid = mem->nid;
- put_device(&mem->dev);
-
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
@@ -1182,7 +1148,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+ return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
MEMORY_OFFLINE);
}
@@ -1206,14 +1172,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+ unsigned long end_pfn)
{
unsigned long pfn, sec_end_pfn;
- unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
@@ -1234,24 +1199,15 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
continue;
/* Check if we got outside of the zone */
if (zone && !zone_spans_pfn(zone, pfn + i))
- return 0;
+ return NULL;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
- return 0;
- if (!zone)
- start = pfn + i;
+ return NULL;
zone = page_zone(page);
- end = pfn + MAX_ORDER_NR_PAGES;
}
}
- if (zone) {
- *valid_start = start;
- *valid_end = min(end, end_pfn);
- return 1;
- } else {
- return 0;
- }
+ return zone;
}
/*
@@ -1496,7 +1452,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
- unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
char *reason;
@@ -1521,14 +1476,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
- &valid_end)) {
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
-
- zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
/* set above range as isolated */
@@ -1764,8 +1717,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
BUG_ON(check_hotplug_memory_range(start, size));
- mem_hotplug_begin();
-
/*
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
@@ -1778,9 +1729,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- /* remove memory block devices before removing memory */
+ /*
+ * Memory block device removal under the device_hotplug_lock is
+ * a barrier against racing online attempts.
+ */
remove_memory_block_devices(start, size);
+ mem_hotplug_begin();
+
arch_remove_memory(nid, start, size, NULL);
memblock_free(start, size);
memblock_remove(start, size);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b2920ae87a61..977c641f78cf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2821,6 +2821,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
char *flags = strchr(str, '=');
int err = 1, mode;
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
+
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
@@ -2831,9 +2834,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
} else
nodes_clear(nodes);
- if (flags)
- *flags++ = '\0'; /* terminate mode string */
-
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
diff --git a/mm/memremap.c b/mm/memremap.c
index c51c6bd2fe34..09b5b7adc773 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -27,7 +27,8 @@ static void devmap_managed_enable_put(void)
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- if (!pgmap->ops || !pgmap->ops->page_free) {
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
+ (!pgmap->ops || !pgmap->ops->page_free)) {
WARN(1, "Missing page_free method\n");
return -EINVAL;
}
@@ -119,6 +120,8 @@ void memunmap_pages(struct dev_pagemap *pgmap)
nid = page_to_nid(first_page);
mem_hotplug_begin();
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
__remove_pages(PHYS_PFN(res->start),
PHYS_PFN(resource_size(res)), NULL);
@@ -410,48 +413,42 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
EXPORT_SYMBOL_GPL(get_dev_pagemap);
#ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page)
+void free_devmap_managed_page(struct page *page)
{
- int count = page_ref_dec_return(page);
-
- /*
- * If refcount is 1 then page is freed and refcount is stable as nobody
- * holds a reference on the page.
- */
- if (count == 1) {
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
- __ClearPageWaiters(page);
+ /* notify page idle for dax */
+ if (!is_device_private_page(page)) {
+ wake_up_var(&page->_refcount);
+ return;
+ }
- mem_cgroup_uncharge(page);
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
- /*
- * When a device_private page is freed, the page->mapping field
- * may still contain a (stale) mapping value. For example, the
- * lower bits of page->mapping may still identify the page as
- * an anonymous page. Ultimately, this entire field is just
- * stale and wrong, and it will cause errors if not cleared.
- * One example is:
- *
- * migrate_vma_pages()
- * migrate_vma_insert_page()
- * page_add_new_anon_rmap()
- * __page_set_anon_rmap()
- * ...checks page->mapping, via PageAnon(page) call,
- * and incorrectly concludes that the page is an
- * anonymous page. Therefore, it incorrectly,
- * silently fails to set up the new anon rmap.
- *
- * For other types of ZONE_DEVICE pages, migration is either
- * handled differently or not done at all, so there is no need
- * to clear page->mapping.
- */
- if (is_device_private_page(page))
- page->mapping = NULL;
+ mem_cgroup_uncharge(page);
- page->pgmap->ops->page_free(page);
- } else if (!count)
- __put_page(page);
+ /*
+ * When a device_private page is freed, the page->mapping field
+ * may still contain a (stale) mapping value. For example, the
+ * lower bits of page->mapping may still identify the page as an
+ * anonymous page. Ultimately, this entire field is just stale
+ * and wrong, and it will cause errors if not cleared. One
+ * example is:
+ *
+ * migrate_vma_pages()
+ * migrate_vma_insert_page()
+ * page_add_new_anon_rmap()
+ * __page_set_anon_rmap()
+ * ...checks page->mapping, via PageAnon(page) call,
+ * and incorrectly concludes that the page is an
+ * anonymous page. Therefore, it incorrectly,
+ * silently fails to set up the new anon rmap.
+ *
+ * For other types of ZONE_DEVICE pages, migration is either
+ * handled differently or not done at all, so there is no need
+ * to clear page->mapping.
+ */
+ page->mapping = NULL;
+ page->pgmap->ops->page_free(page);
}
-EXPORT_SYMBOL(__put_devmap_managed_page);
#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/mm/migrate.c b/mm/migrate.c
index 86873b6f38a7..b1092876e537 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -48,6 +48,7 @@
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
@@ -986,7 +987,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
}
/*
- * Anonymous and movable page->mapping will be cleard by
+ * Anonymous and movable page->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
@@ -1199,8 +1200,7 @@ out:
/*
* A page that has been migrated has all references
* removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
+ * migrated will have kept its references and be restored.
*/
list_del(&page->lru);
@@ -1627,8 +1627,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, so need to incude the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
err = store_status(status, start, current_node, i - start);
if (err)
goto out;
@@ -1659,8 +1670,11 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
if (i > start) {
err = store_status(status, start, current_node, i - start);
if (err)
@@ -1674,9 +1688,16 @@ out_flush:
/* Make sure we do not overwrite the existing error */
err1 = do_move_pages_to_node(mm, &pagelist, current_node);
+ /*
+ * Don't have to report non-attempted pages here since:
+ * - If the above loop is done gracefully all pages have been
+ * attempted.
+ * - If the above loop is aborted it means a fatal error
+ * happened, should return ret.
+ */
if (!err1)
err1 = store_status(status, start, current_node, i - start);
- if (!err)
+ if (err >= 0)
err = err1;
out:
return err;
@@ -2130,12 +2151,13 @@ out_unlock:
#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
migrate->npages++;
@@ -2152,7 +2174,7 @@ static int migrate_vma_collect_skip(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2174,7 +2196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
again:
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, walk);
+ return migrate_vma_collect_hole(start, end, -1, walk);
if (pmd_trans_huge(*pmdp)) {
struct page *page;
@@ -2207,7 +2229,7 @@ again:
return migrate_vma_collect_skip(start, end,
walk);
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end,
+ return migrate_vma_collect_hole(start, end, -1,
walk);
}
}
@@ -2675,6 +2697,14 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2755,30 +2785,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
if (pte_present(*ptep)) {
unsigned long pfn = pte_pfn(*ptep);
- if (!is_zero_pfn(pfn)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
flush = true;
- } else if (!pte_none(*ptep)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
/*
- * Check for usefaultfd but do not deliver the fault. Instead,
+ * Check for userfaultfd but do not deliver the fault. Instead,
* just back off.
*/
- if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
@@ -2802,6 +2826,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*src = MIGRATE_PFN_MIGRATE;
return;
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
@@ -2834,9 +2861,8 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
if (!page) {
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
- }
if (!notified) {
notified = true;
diff --git a/mm/mincore.c b/mm/mincore.c
index 49b6fa2f6aa1..0e6dd9948f1a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -112,6 +112,7 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
}
static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
walk->private += __mincore_unmapped_range(addr, end,
diff --git a/mm/mmap.c b/mm/mmap.c
index bc788548c4e5..6756b8bb0033 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1270,26 +1270,22 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma;
- struct vm_area_struct *near;
-
- near = vma->vm_next;
- if (!near)
- goto try_prev;
-
- anon_vma = reusable_anon_vma(near, vma, near);
- if (anon_vma)
- return anon_vma;
-try_prev:
- near = vma->vm_prev;
- if (!near)
- goto none;
-
- anon_vma = reusable_anon_vma(near, near, vma);
- if (anon_vma)
- return anon_vma;
-none:
+ struct anon_vma *anon_vma = NULL;
+
+ /* Try next first. */
+ if (vma->vm_next) {
+ anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ if (anon_vma)
+ return anon_vma;
+ }
+
+ /* Try prev next. */
+ if (vma->vm_prev)
+ anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+
/*
+ * We might reach here with anon_vma == NULL if we can't find
+ * any reusable anon_vma.
* There's no absolute need to look only at touching neighbours:
* we could search further afield for "compatible" anon_vmas.
* But it would probably just be a waste of time searching,
@@ -1297,7 +1293,7 @@ none:
* We're trying to allow mprotect remerging later on,
* not trying to minimize memory used for anon_vmas.
*/
- return NULL;
+ return anon_vma;
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7d70e5c78f97..a3538cb2bcbe 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -11,7 +11,7 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
static bool tlb_next_batch(struct mmu_gather *tlb)
{
@@ -69,7 +69,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
VM_BUG_ON(!tlb->end);
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
#endif
@@ -89,58 +89,108 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
return false;
}
-#endif /* HAVE_MMU_GATHER_NO_GATHER */
+#endif /* MMU_GATHER_NO_GATHER */
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#ifdef CONFIG_MMU_GATHER_TABLE_FREE
-/*
- * See the comment near struct mmu_table_batch.
- */
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ int i;
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
- * If we want tlb_remove_table() to imply TLB invalidates.
+ * Semi RCU freeing of the page directories.
+ *
+ * This is needed by some architectures to implement software pagetable walkers.
+ *
+ * gup_fast() and other software pagetable walkers do a lockless page-table
+ * walk and therefore needs some synchronization with the freeing of the page
+ * directories. The chosen means to accomplish that is by disabling IRQs over
+ * the walk.
+ *
+ * Architectures that use IPIs to flush TLBs will then automagically DTRT,
+ * since we unlink the page, flush TLBs, free the page. Since the disabling of
+ * IRQs delays the completion of the TLB flush we can never observe an already
+ * freed page.
+ *
+ * Architectures that do not have this (PPC) need to delay the freeing by some
+ * other means, this is that means.
+ *
+ * What we do is batch the freed directory pages (tables) and RCU free them.
+ * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
+ * holds off grace periods.
+ *
+ * However, in order to batch these pages we need to allocate storage, this
+ * allocation is deep inside the MM code and can thus easily fail on memory
+ * pressure. To guarantee progress we fall back to single table freeing, see
+ * the implementation of tlb_remove_table_one().
+ *
*/
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
-{
-#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
-}
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* assumed to be actually RCU-freed.
*
* It is however sufficient for software page-table walkers that rely on
- * IRQ disabling. See the comment near struct mmu_table_batch.
+ * IRQ disabling.
*/
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
- struct mmu_table_batch *batch;
- int i;
+ __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
+}
+
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ call_rcu(&batch->rcu, tlb_remove_table_rcu);
+}
- batch = container_of(head, struct mmu_table_batch, rcu);
+#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
+static void tlb_remove_table_sync_one(void) { }
- free_page((unsigned long)batch);
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ __tlb_remove_table_free(batch);
+}
+
+#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ tlb_remove_table_sync_one();
+ __tlb_remove_table(table);
}
static void tlb_table_flush(struct mmu_gather *tlb)
@@ -149,7 +199,7 @@ static void tlb_table_flush(struct mmu_gather *tlb)
if (*batch) {
tlb_table_invalidate(tlb);
- call_rcu(&(*batch)->rcu, tlb_remove_table_rcu);
+ tlb_remove_table_free(*batch);
*batch = NULL;
}
}
@@ -173,14 +223,22 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_table_flush(tlb);
}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+static inline void tlb_table_init(struct mmu_gather *tlb)
+{
+ tlb->batch = NULL;
+}
+
+#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
+
+static inline void tlb_table_flush(struct mmu_gather *tlb) { }
+static inline void tlb_table_init(struct mmu_gather *tlb) { }
+
+#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
-#endif
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb_batch_pages_flush(tlb);
#endif
}
@@ -211,7 +269,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
/* Is it from 0 to ~0? */
tlb->fullmm = !(start | (end+1));
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
@@ -220,10 +278,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->batch_count = 0;
#endif
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ tlb_table_init(tlb);
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
@@ -271,7 +327,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
tlb_flush_mmu(tlb);
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb_batch_list_free(tlb);
#endif
dec_tlb_flush_pending(tlb->mm);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d58c481b3df8..dfc357614e56 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
@@ -620,6 +621,7 @@ static void oom_reap_task(struct task_struct *tsk)
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
+ sched_show_task(tsk);
debug_show_all_locks();
done:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d047bf7d8fd4..3c4eb750a199 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5848,6 +5848,23 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return false;
}
+#ifdef CONFIG_SPARSEMEM
+/* Skip PFNs that belong to non-present sections */
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+ const unsigned long section_nr = pfn_to_section_nr(++pfn);
+
+ if (present_section_nr(section_nr))
+ return pfn;
+ return section_nr_to_pfn(next_present_section_nr(section_nr));
+}
+#else
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+ return pfn++;
+}
+#endif
+
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5881,16 +5898,20 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
#endif
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn < end_pfn; ) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+ pfn = next_pfn(pfn);
continue;
- if (!early_pfn_in_nid(pfn, nid))
+ }
+ if (!early_pfn_in_nid(pfn, nid)) {
+ pfn++;
continue;
+ }
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -5918,16 +5939,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+ pfn++;
}
}
#ifdef CONFIG_ZONE_DEVICE
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
- unsigned long size,
+ unsigned long nr_pages,
struct dev_pagemap *pgmap)
{
- unsigned long pfn, end_pfn = start_pfn + size;
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
@@ -5944,7 +5966,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- size = end_pfn - start_pfn;
+ nr_pages = end_pfn - start_pfn;
}
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -5991,7 +6013,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
- size, jiffies_to_msecs(jiffies - start));
+ nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif
@@ -6890,10 +6912,10 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
- * pages zeroed
+ * Initialize all valid struct pages in the range [spfn, epfn) and mark them
+ * PageReserved(). Return the number of struct pages that were initialized.
*/
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
{
unsigned long pfn;
u64 pgcnt = 0;
@@ -6904,7 +6926,13 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+ pageblock_nr_pages - 1;
continue;
}
- mm_zero_struct_page(pfn_to_page(pfn));
+ /*
+ * Use a fake node/zone (0) for now. Some of these pages
+ * (in memblock.reserved but not in memblock.memory) will
+ * get re-initialized via reserve_bootmem_region() later.
+ */
+ __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
+ __SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
@@ -6916,14 +6944,15 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly zero those struct pages.
+ * flags). We must explicitly initialize those struct pages.
*
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=.
+ * layout is manually configured via memmap=, or when the highest physical
+ * address (max_pfn) does not end on a section boundary.
*/
-void __init zero_resv_unavail(void)
+static void __init init_unavailable_mem(void)
{
phys_addr_t start, end;
u64 i, pgcnt;
@@ -6936,10 +6965,20 @@ void __init zero_resv_unavail(void)
for_each_mem_range(i, &memblock.memory, NULL,
NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
if (next < start)
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ PFN_UP(start));
next = end;
}
- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
+
+ /*
+ * Early sections always have a fully populated memmap for the whole
+ * section - see pfn_valid(). If the last section has holes at the
+ * end and that section is marked "online", the memmap will be
+ * considered initialized. Make sure that memmap has a well defined
+ * state.
+ */
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
@@ -6948,6 +6987,10 @@ void __init zero_resv_unavail(void)
if (pgcnt)
pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
+#else
+static inline void __init init_unavailable_mem(void)
+{
+}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -7377,7 +7420,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- zero_resv_unavail();
+ init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
@@ -7572,7 +7615,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init free_area_init(unsigned long *zones_size)
{
- zero_resv_unavail();
+ init_unavailable_mem();
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
@@ -8154,20 +8197,22 @@ void *__init alloc_large_system_hash(const char *tablename,
/*
* This function checks whether pageblock includes unmovable pages or not.
- * If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
* check without lock_page also may miss some movable non-lru pages at
* race condition. So you can't expect this function should be exact.
+ *
+ * Returns a page without holding a reference. If the caller wants to
+ * dereference that page (e.g., dumping), it has to make sure that that it
+ * cannot get removed (e.g., via memory unplug) concurrently.
+ *
*/
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
- int migratetype, int flags)
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
+ int migratetype, int flags)
{
- unsigned long found;
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
- const char *reason = "unmovable page";
/*
* TODO we could make this much more efficient by not checking every
@@ -8184,22 +8229,19 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* so consider them movable here.
*/
if (is_migrate_cma(migratetype))
- return false;
+ return NULL;
- reason = "CMA page";
- goto unmovable;
+ return page;
}
- for (found = 0; iter < pageblock_nr_pages; iter++) {
- unsigned long check = pfn + iter;
-
- if (!pfn_valid_within(check))
+ for (; iter < pageblock_nr_pages; iter++) {
+ if (!pfn_valid_within(pfn + iter))
continue;
- page = pfn_to_page(check);
+ page = pfn_to_page(pfn + iter);
if (PageReserved(page))
- goto unmovable;
+ return page;
/*
* If the zone is movable and we have ruled out all reserved
@@ -8219,7 +8261,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unsigned int skip_pages;
if (!hugepage_migration_supported(page_hstate(head)))
- goto unmovable;
+ return page;
skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
@@ -8245,11 +8287,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
- if (__PageMovable(page))
+ if (__PageMovable(page) || PageLRU(page))
continue;
- if (!PageLRU(page))
- found++;
/*
* If there are RECLAIMABLE pages, we need to check
* it. But now, memory offline itself doesn't call
@@ -8263,15 +8303,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* is set to both of a memory hole page and a _used_ kernel
* page at boot.
*/
- if (found > count)
- goto unmovable;
+ return page;
}
- return false;
-unmovable:
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
- if (flags & REPORT_FAILURE)
- dump_page(pfn_to_page(pfn + iter), reason);
- return true;
+ return NULL;
}
#ifdef CONFIG_CONTIG_ALLOC
@@ -8675,10 +8709,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
offlined_pages += 1 << order;
-#ifdef CONFIG_DEBUG_VM
- pr_info("remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
-#endif
del_page_from_free_area(page, &zone->free_area[order]);
pfn += (1 << order);
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 3a198deb8bb1..76965be1d40e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -177,8 +177,9 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
cond_resched();
- first_block = bmap(inode, probe_block);
- if (first_block == 0)
+ first_block = probe_block;
+ ret = bmap(inode, &first_block);
+ if (ret || !first_block)
goto bad_bmap;
/*
@@ -193,9 +194,11 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
block_in_page++) {
sector_t block;
- block = bmap(inode, probe_block + block_in_page);
- if (block == 0)
+ block = probe_block + block_in_page;
+ ret = bmap(inode, &block);
+ if (ret || !block)
goto bad_bmap;
+
if (block != first_block + block_in_page) {
/* Discontiguity */
probe_block++;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 04ee1663cdbe..a9fd7c740c23 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -17,10 +17,9 @@
static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
+ struct page *unmovable = NULL;
struct zone *zone;
- unsigned long flags, pfn;
- struct memory_isolate_notify arg;
- int notifier_ret;
+ unsigned long flags;
int ret = -EBUSY;
zone = page_zone(page);
@@ -35,41 +34,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
if (is_migrate_isolate_page(page))
goto out;
- pfn = page_to_pfn(page);
- arg.start_pfn = pfn;
- arg.nr_pages = pageblock_nr_pages;
- arg.pages_found = 0;
-
- /*
- * It may be possible to isolate a pageblock even if the
- * migratetype is not MIGRATE_MOVABLE. The memory isolation
- * notifier chain is used by balloon drivers to return the
- * number of pages in a range that are held by the balloon
- * driver to shrink memory. If all the pages are accounted for
- * by balloons, are free, or on the LRU, isolation can continue.
- * Later, for example, when memory hotplug notifier runs, these
- * pages reported as "can be isolated" should be isolated(freed)
- * by the balloon driver through the memory notifier chain.
- */
- notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
- notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret)
- goto out;
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
- if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
- isol_flags))
- ret = 0;
-
- /*
- * immobile means "not-on-lru" pages. If immobile is larger than
- * removable-by-driver pages reported by notifier, we'll fail.
- */
-
-out:
- if (!ret) {
+ unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
+ if (!unmovable) {
unsigned long nr_pages;
int mt = get_pageblock_migratetype(page);
@@ -79,11 +49,24 @@ out:
NULL);
__mod_zone_freepage_state(zone, -nr_pages, mt);
+ ret = 0;
}
+out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret)
+ if (!ret) {
drain_all_pages(zone);
+ } else {
+ WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+
+ if ((isol_flags & REPORT_FAILURE) && unmovable)
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
+ }
+
return ret;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eff4b4520c8d..719c35246cfa 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -52,12 +52,16 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
}
-static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn)
+static inline bool pfn_is_match(struct page *page, unsigned long pfn)
{
- unsigned long hpage_pfn = page_to_pfn(hpage);
+ unsigned long page_pfn = page_to_pfn(page);
+
+ /* normal page and hugetlbfs page */
+ if (!PageTransCompound(page) || PageHuge(page))
+ return page_pfn == pfn;
/* THP can be referenced by any subpage */
- return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage);
+ return pfn >= page_pfn && pfn - page_pfn < hpage_nr_pages(page);
}
/**
@@ -108,7 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(*pvmw->pte);
}
- return pfn_in_hpage(pvmw->page, pfn);
+ return pfn_is_match(pvmw->page, pfn);
}
/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ea0b9e606ad1..928df1638c30 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -4,26 +4,57 @@
#include <linux/sched.h>
#include <linux/hugetlb.h>
-static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+/*
+ * We want to know the real level where a entry is located ignoring any
+ * folding of levels which may be happening. For example if p4d is folded then
+ * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
+ */
+static int real_depth(int depth)
+{
+ if (depth == 3 && PTRS_PER_PMD == 1)
+ depth = 2;
+ if (depth == 2 && PTRS_PER_PUD == 1)
+ depth = 1;
+ if (depth == 1 && PTRS_PER_P4D == 1)
+ depth = 0;
+ return depth;
+}
+
+static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *pte;
- int err = 0;
const struct mm_walk_ops *ops = walk->ops;
- spinlock_t *ptl;
+ int err = 0;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (;;) {
err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
- addr += PAGE_SIZE;
- if (addr == end)
+ if (addr >= end - PAGE_SIZE)
break;
+ addr += PAGE_SIZE;
pte++;
}
+ return err;
+}
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pte_t *pte;
+ int err = 0;
+ spinlock_t *ptl;
+
+ if (walk->no_vma) {
+ pte = pte_offset_map(pmd, addr);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap(pte);
+ } else {
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
- pte_unmap_unlock(pte, ptl);
return err;
}
@@ -34,18 +65,22 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ int depth = real_depth(3);
pmd = pmd_offset(pud, addr);
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd) || !walk->vma) {
+ if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
continue;
}
+
+ walk->action = ACTION_SUBTREE;
+
/*
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
@@ -55,16 +90,24 @@ again:
if (err)
break;
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
/*
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
- if (!ops->pte_entry)
+ if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
+ walk->action == ACTION_CONTINUE ||
+ !(ops->pte_entry))
continue;
- split_huge_pmd(walk->vma, pmd, addr);
- if (pmd_trans_unstable(pmd))
- goto again;
+ if (walk->vma) {
+ split_huge_pmd(walk->vma, pmd, addr);
+ if (pmd_trans_unstable(pmd))
+ goto again;
+ }
+
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
@@ -80,37 +123,41 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ int depth = real_depth(2);
pud = pud_offset(p4d, addr);
do {
again:
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || !walk->vma) {
+ if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
continue;
}
- if (ops->pud_entry) {
- spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
+ walk->action = ACTION_SUBTREE;
- if (ptl) {
- err = ops->pud_entry(pud, addr, next, walk);
- spin_unlock(ptl);
- if (err)
- break;
- continue;
- }
- }
+ if (ops->pud_entry)
+ err = ops->pud_entry(pud, addr, next, walk);
+ if (err)
+ break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
- split_huge_pud(walk->vma, pud, addr);
+ if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+ walk->action == ACTION_CONTINUE ||
+ !(ops->pmd_entry || ops->pte_entry))
+ continue;
+
+ if (walk->vma)
+ split_huge_pud(walk->vma, pud, addr);
if (pud_none(*pud))
goto again;
- if (ops->pmd_entry || ops->pte_entry)
- err = walk_pmd_range(pud, addr, next, walk);
+ err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -125,18 +172,24 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ int depth = real_depth(1);
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
continue;
}
- if (ops->pmd_entry || ops->pte_entry)
+ if (ops->p4d_entry) {
+ err = ops->p4d_entry(p4d, addr, next, walk);
+ if (err)
+ break;
+ }
+ if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -153,17 +206,26 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
- pgd = pgd_offset(walk->mm, addr);
+ if (walk->pgd)
+ pgd = walk->pgd + pgd_index(addr);
+ else
+ pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, 0, walk);
if (err)
break;
continue;
}
- if (ops->pmd_entry || ops->pte_entry)
+ if (ops->pgd_entry) {
+ err = ops->pgd_entry(pgd, addr, next, walk);
+ if (err)
+ break;
+ }
+ if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+ ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -199,7 +261,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, -1, walk);
if (err)
break;
@@ -243,7 +305,7 @@ static int walk_page_test(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
if (ops->pte_hole)
- err = ops->pte_hole(start, end, walk);
+ err = ops->pte_hole(start, end, -1, walk);
return err ? err : 1;
}
return 0;
@@ -369,6 +431,33 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
+/*
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback. This is useful for
+ * walking the kernel pages tables or page tables for firmware.
+ */
+int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+
+ lockdep_assert_held(&walk.mm->mmap_sem);
+
+ return __walk_page_range(start, end, &walk);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7bef6c0..de41e830cdac 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -42,12 +42,11 @@ static int process_vm_rw_pages(struct page **pages,
if (copy > len)
copy = len;
- if (vm_write) {
+ if (vm_write)
copied = copy_page_from_iter(page, offset, copy, iter);
- set_page_dirty_lock(page);
- } else {
+ else
copied = copy_page_to_iter(page, offset, copy, iter);
- }
+
len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
@@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;
while (!rc && nr_pages && iov_iter_count(iter)) {
- int pages = min(nr_pages, max_pages_per_loop);
+ int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;
@@ -106,14 +105,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
down_read(&mm->mmap_sem);
- pages = get_user_pages_remote(task, mm, pa, pages, flags,
- process_pages, NULL, &locked);
+ pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+ flags, process_pages,
+ NULL, &locked);
if (locked)
up_read(&mm->mmap_sem);
- if (pages <= 0)
+ if (pinned_pages <= 0)
return -EFAULT;
- bytes = pages * PAGE_SIZE - start_offset;
+ bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;
@@ -122,10 +122,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
vm_write);
len -= bytes;
start_offset = 0;
- nr_pages -= pages;
- pa += pages * PAGE_SIZE;
- while (pages)
- put_page(process_pages[--pages]);
+ nr_pages -= pinned_pages;
+ pa += pinned_pages * PAGE_SIZE;
+
+ /* If vm_write is set, the pages need to be made dirty: */
+ unpin_user_pages_dirty_lock(process_pages, pinned_pages,
+ vm_write);
}
return rc;
diff --git a/mm/ptdump.c b/mm/ptdump.c
new file mode 100644
index 000000000000..26208d0d03b7
--- /dev/null
+++ b/mm/ptdump.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pagewalk.h>
+#include <linux/ptdump.h>
+#include <linux/kasan.h>
+
+#ifdef CONFIG_KASAN
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_early_shadow_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline int note_kasan_page_table(struct mm_walk *walk,
+ unsigned long addr)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+#endif
+
+static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pgd_t val = READ_ONCE(*pgd);
+
+#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN)
+ if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (pgd_leaf(val))
+ st->note_page(st, addr, 0, pgd_val(val));
+
+ return 0;
+}
+
+static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ p4d_t val = READ_ONCE(*p4d);
+
+#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN)
+ if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (p4d_leaf(val))
+ st->note_page(st, addr, 1, p4d_val(val));
+
+ return 0;
+}
+
+static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pud_t val = READ_ONCE(*pud);
+
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
+ if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (pud_leaf(val))
+ st->note_page(st, addr, 2, pud_val(val));
+
+ return 0;
+}
+
+static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pmd_t val = READ_ONCE(*pmd);
+
+#if defined(CONFIG_KASAN)
+ if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (pmd_leaf(val))
+ st->note_page(st, addr, 3, pmd_val(val));
+
+ return 0;
+}
+
+static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte)));
+
+ return 0;
+}
+
+static int ptdump_hole(unsigned long addr, unsigned long next,
+ int depth, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, depth, 0);
+
+ return 0;
+}
+
+static const struct mm_walk_ops ptdump_ops = {
+ .pgd_entry = ptdump_pgd_entry,
+ .p4d_entry = ptdump_p4d_entry,
+ .pud_entry = ptdump_pud_entry,
+ .pmd_entry = ptdump_pmd_entry,
+ .pte_entry = ptdump_pte_entry,
+ .pte_hole = ptdump_hole,
+};
+
+void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
+{
+ const struct ptdump_range *range = st->range;
+
+ down_read(&mm->mmap_sem);
+ while (range->start != range->end) {
+ walk_page_range_novma(mm, range->start, range->end,
+ &ptdump_ops, pgd, st);
+ range++;
+ }
+ up_read(&mm->mmap_sem);
+
+ /* Flush out the last page */
+ st->note_page(st, 0, -1, 0);
+}
diff --git a/mm/shmem.c b/mm/shmem.c
index 8793e8cc1a48..c8f7540ef048 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3381,9 +3381,19 @@ enum shmem_param {
Opt_uid,
};
-static const struct fs_parameter_spec shmem_param_specs[] = {
+static const struct constant_table shmem_param_enums_huge[] = {
+ {"never", SHMEM_HUGE_NEVER },
+ {"always", SHMEM_HUGE_ALWAYS },
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
+ {"advise", SHMEM_HUGE_ADVISE },
+ {"deny", SHMEM_HUGE_DENY },
+ {"force", SHMEM_HUGE_FORCE },
+ {}
+};
+
+const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_u32 ("gid", Opt_gid),
- fsparam_enum ("huge", Opt_huge),
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
fsparam_u32oct("mode", Opt_mode),
fsparam_string("mpol", Opt_mpol),
fsparam_string("nr_blocks", Opt_nr_blocks),
@@ -3393,20 +3403,6 @@ static const struct fs_parameter_spec shmem_param_specs[] = {
{}
};
-static const struct fs_parameter_enum shmem_param_enums[] = {
- { Opt_huge, "never", SHMEM_HUGE_NEVER },
- { Opt_huge, "always", SHMEM_HUGE_ALWAYS },
- { Opt_huge, "within_size", SHMEM_HUGE_WITHIN_SIZE },
- { Opt_huge, "advise", SHMEM_HUGE_ADVISE },
- {}
-};
-
-const struct fs_parameter_description shmem_fs_parameters = {
- .name = "tmpfs",
- .specs = shmem_param_specs,
- .enums = shmem_param_enums,
-};
-
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
struct shmem_options *ctx = fc->fs_private;
@@ -3415,7 +3411,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
char *rest;
int opt;
- opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -3479,9 +3475,9 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
return 0;
unsupported_parameter:
- return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
bad_value:
- return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
+ return invalfc(fc, "Bad value for '%s'", param->key);
}
static int shmem_parse_options(struct fs_context *fc, void *data)
@@ -3587,7 +3583,7 @@ static int shmem_reconfigure(struct fs_context *fc)
return 0;
out:
spin_unlock(&sbinfo->stat_lock);
- return invalf(fc, "tmpfs: %s", err);
+ return invalfc(fc, "%s", err);
}
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
@@ -3889,7 +3885,7 @@ static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
- .parameters = &shmem_fs_parameters,
+ .parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
@@ -4035,7 +4031,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.init_fs_context = ramfs_init_fs_context,
- .parameters = &ramfs_fs_parameters,
+ .parameters = ramfs_fs_parameters,
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0d95ddea13b0..1907cb2903c7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1580,18 +1580,17 @@ static int slabinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &slabinfo_op);
}
-static const struct file_operations proc_slabinfo_operations = {
- .open = slabinfo_open,
- .read = seq_read,
- .write = slabinfo_write,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops slabinfo_proc_ops = {
+ .proc_open = slabinfo_open,
+ .proc_read = seq_read,
+ .proc_write = slabinfo_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init slab_proc_init(void)
{
- proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
- &proc_slabinfo_operations);
+ proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
return 0;
}
module_init(slab_proc_init);
@@ -1677,28 +1676,6 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
}
/**
- * __krealloc - like krealloc() but don't free @p.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * This function is like krealloc() except it never frees the originally
- * allocated buffer. Use this if you don't want to free the buffer immediately
- * like, for example, with RCU.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *__krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- if (unlikely(!new_size))
- return ZERO_SIZE_PTR;
-
- return __do_krealloc(p, new_size, flags);
-
-}
-EXPORT_SYMBOL(__krealloc);
-
-/**
* krealloc - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
diff --git a/mm/slub.c b/mm/slub.c
index 0ab92ec8c2a6..17dc00e33115 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -439,19 +439,38 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
}
#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_SPINLOCK(object_map_lock);
+
/*
* Determine a map of object in use on a page.
*
* Node listlock must be held to guarantee that the page does
* not vanish from under us.
*/
-static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+static unsigned long *get_map(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
+ VM_BUG_ON(!irqs_disabled());
+
+ spin_lock(&object_map_lock);
+
+ bitmap_zero(object_map, page->objects);
+
for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(slab_index(p, s, addr), map);
+ set_bit(slab_index(p, s, addr), object_map);
+
+ return object_map;
+}
+
+static void put_map(unsigned long *map)
+{
+ VM_BUG_ON(map != object_map);
+ lockdep_assert_held(&object_map_lock);
+
+ spin_unlock(&object_map_lock);
}
static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -3675,13 +3694,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
- if (!map)
- return;
+ unsigned long *map;
+
slab_err(s, page, text, s->name);
slab_lock(page);
- get_map(s, page, map);
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3689,8 +3707,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
+ put_map(map);
+
slab_unlock(page);
- bitmap_free(map);
#endif
}
@@ -4384,19 +4403,19 @@ static int count_total(struct page *page)
#endif
#ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page,
- unsigned long *map)
+static void validate_slab(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
+ unsigned long *map;
+
+ slab_lock(page);
if (!check_slab(s, page) || !on_freelist(s, page, NULL))
- return;
+ goto unlock;
/* Now we know that a valid freelist exists */
- bitmap_zero(map, page->objects);
-
- get_map(s, page, map);
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
u8 val = test_bit(slab_index(p, s, addr), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
@@ -4404,18 +4423,13 @@ static void validate_slab(struct kmem_cache *s, struct page *page,
if (!check_object(s, page, p, val))
break;
}
-}
-
-static void validate_slab_slab(struct kmem_cache *s, struct page *page,
- unsigned long *map)
-{
- slab_lock(page);
- validate_slab(s, page, map);
+ put_map(map);
+unlock:
slab_unlock(page);
}
static int validate_slab_node(struct kmem_cache *s,
- struct kmem_cache_node *n, unsigned long *map)
+ struct kmem_cache_node *n)
{
unsigned long count = 0;
struct page *page;
@@ -4424,7 +4438,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
- validate_slab_slab(s, page, map);
+ validate_slab(s, page);
count++;
}
if (count != n->nr_partial)
@@ -4435,7 +4449,7 @@ static int validate_slab_node(struct kmem_cache *s,
goto out;
list_for_each_entry(page, &n->full, slab_list) {
- validate_slab_slab(s, page, map);
+ validate_slab(s, page);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4452,15 +4466,11 @@ static long validate_slab_cache(struct kmem_cache *s)
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
- unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
-
- if (!map)
- return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
- count += validate_slab_node(s, n, map);
- bitmap_free(map);
+ count += validate_slab_node(s, n);
+
return count;
}
/*
@@ -4590,18 +4600,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc,
- unsigned long *map)
+ struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
void *p;
+ unsigned long *map;
- bitmap_zero(map, page->objects);
- get_map(s, page, map);
-
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
add_location(t, s, get_track(s, p, alloc));
+ put_map(map);
}
static int list_locations(struct kmem_cache *s, char *buf,
@@ -4612,11 +4621,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
- unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
- if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_KERNEL)) {
- bitmap_free(map);
+ if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_KERNEL)) {
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4631,9 +4638,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc, map);
+ process_slab(&t, s, page, alloc);
list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc, map);
+ process_slab(&t, s, page, alloc);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4682,7 +4689,6 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
- bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
diff --git a/mm/sparse.c b/mm/sparse.c
index 3822ecbd8a1f..c184b69460b7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -198,16 +198,6 @@ static void section_mark_present(struct mem_section *ms)
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-static inline unsigned long next_present_section_nr(unsigned long section_nr)
-{
- do {
- section_nr++;
- if (present_section_nr(section_nr))
- return section_nr;
- } while ((section_nr <= __highest_present_section_nr));
-
- return -1;
-}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
((section_nr != -1) && \
@@ -789,7 +779,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+ ms->section_mem_map = (unsigned long)NULL;
}
if (section_is_early && memmap)
diff --git a/mm/swap.c b/mm/swap.c
index 5341ae93861f..cf39d24ada2a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr)
* processing, and instead, expect a call to
* put_page_testzero().
*/
- if (put_devmap_managed_page(page))
+ if (page_is_devmap_managed(page)) {
+ put_devmap_managed_page(page);
continue;
+ }
}
page = compound_head(page);
@@ -1102,3 +1104,26 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void put_devmap_managed_page(struct page *page)
+{
+ int count;
+
+ if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
+ return;
+
+ count = page_ref_dec_return(page);
+
+ /*
+ * devmap page refcounts are 1-based, rather than 0-based: if
+ * refcount is 1, then the page is free and the refcount is
+ * stable because nobody holds a reference on the page.
+ */
+ if (count == 1)
+ free_devmap_managed_page(page);
+ else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_devmap_managed_page);
+#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bb3261d45b6a..2c33ff456ed5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2737,10 +2737,10 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;
+ ++(*pos);
for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
- ++*pos;
return si;
}
@@ -2796,17 +2796,17 @@ static int swaps_open(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations proc_swaps_operations = {
- .open = swaps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
- .poll = swaps_poll,
+static const struct proc_ops swaps_proc_ops = {
+ .proc_open = swaps_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+ .proc_poll = swaps_poll,
};
static int __init procswaps_init(void)
{
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
return 0;
}
__initcall(procswaps_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 572fb17c6273..c05eb9efec07 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,20 +146,6 @@ struct scan_control {
struct reclaim_state reclaim_state;
};
-#ifdef ARCH_HAS_PREFETCH
-#define prefetch_prev_lru_page(_page, _base, _field) \
- do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
- \
- prev = lru_to_page(&(_page->lru)); \
- prefetch(&prev->_field); \
- } \
- } while (0)
-#else
-#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
-#endif
-
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
@@ -2695,7 +2681,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
-static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
@@ -2874,8 +2860,6 @@ again:
*/
if (reclaimable)
pgdat->kswapd_failures = 0;
-
- return reclaimable;
}
/*
@@ -4126,10 +4110,8 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
-#define RECLAIM_OFF 0
-#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
+#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
diff --git a/mm/zswap.c b/mm/zswap.c
index 46a322316e52..55094e63b72d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -32,6 +32,7 @@
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
+#include <linux/workqueue.h>
/*********************************
* statistics
@@ -65,6 +66,11 @@ static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;
+/* Shrinker work queue */
+static struct workqueue_struct *shrink_wq;
+/* Pool limit was hit, we need to calm down */
+static bool zswap_pool_reached_full;
+
/*********************************
* tunables
**********************************/
@@ -109,6 +115,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
+/* The threshold for accepting new pages after the max_pool_percent was hit */
+static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
+module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
+ uint, 0644);
+
/* Enable/disable handling same-value filled pages (enabled by default) */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
@@ -123,7 +134,8 @@ struct zswap_pool {
struct crypto_comp * __percpu *tfm;
struct kref kref;
struct list_head list;
- struct work_struct work;
+ struct work_struct release_work;
+ struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -214,6 +226,13 @@ static bool zswap_is_full(void)
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
+static bool zswap_can_accept(void)
+{
+ return totalram_pages() * zswap_accept_thr_percent / 100 *
+ zswap_max_pool_percent / 100 >
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
@@ -501,6 +520,16 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+static void shrink_worker(struct work_struct *w)
+{
+ struct zswap_pool *pool = container_of(w, typeof(*pool),
+ shrink_work);
+
+ if (zpool_shrink(pool->zpool, 1, NULL))
+ zswap_reject_reclaim_fail++;
+ zswap_pool_put(pool);
+}
+
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
@@ -551,6 +580,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
+ INIT_WORK(&pool->shrink_work, shrink_worker);
zswap_pool_debug("created", pool);
@@ -624,7 +654,8 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
static void __zswap_pool_release(struct work_struct *work)
{
- struct zswap_pool *pool = container_of(work, typeof(*pool), work);
+ struct zswap_pool *pool = container_of(work, typeof(*pool),
+ release_work);
synchronize_rcu();
@@ -647,8 +678,8 @@ static void __zswap_pool_empty(struct kref *kref)
list_del_rcu(&pool->list);
- INIT_WORK(&pool->work, __zswap_pool_release);
- schedule_work(&pool->work);
+ INIT_WORK(&pool->release_work, __zswap_pool_release);
+ schedule_work(&pool->release_work);
spin_unlock(&zswap_pools_lock);
}
@@ -942,22 +973,6 @@ end:
return ret;
}
-static int zswap_shrink(void)
-{
- struct zswap_pool *pool;
- int ret;
-
- pool = zswap_pool_last_get();
- if (!pool)
- return -ENOENT;
-
- ret = zpool_shrink(pool->zpool, 1, NULL);
-
- zswap_pool_put(pool);
-
- return ret;
-}
-
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned int pos;
@@ -1011,21 +1026,23 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
+ struct zswap_pool *pool;
+
zswap_pool_limit_hit++;
- if (zswap_shrink()) {
- zswap_reject_reclaim_fail++;
- ret = -ENOMEM;
- goto reject;
- }
+ zswap_pool_reached_full = true;
+ pool = zswap_pool_last_get();
+ if (pool)
+ queue_work(shrink_wq, &pool->shrink_work);
+ ret = -ENOMEM;
+ goto reject;
+ }
- /* A second zswap_is_full() check after
- * zswap_shrink() to make sure it's now
- * under the max_pool_percent
- */
- if (zswap_is_full()) {
+ if (zswap_pool_reached_full) {
+ if (!zswap_can_accept()) {
ret = -ENOMEM;
goto reject;
- }
+ } else
+ zswap_pool_reached_full = false;
}
/* allocate entry */
@@ -1332,11 +1349,18 @@ static int __init init_zswap(void)
zswap_enabled = false;
}
+ shrink_wq = create_workqueue("zswap-shrink");
+ if (!shrink_wq)
+ goto fallback_fail;
+
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;
+fallback_fail:
+ if (pool)
+ zswap_pool_destroy(pool);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
OpenPOWER on IntegriCloud