author     James Morris <james.morris@microsoft.com>  2018-09-04 11:35:54 -0700
committer  James Morris <james.morris@microsoft.com>  2018-09-04 11:35:54 -0700
commit     e42f6f9be4f83c537aa81b4c6239ea94ff5b29ce (patch)
tree       f956a5ea0e83fc6d0df3e64681e7bbc1f201f3ee /mm
parent     4408e300a67ab2ce2505087986a9fe922c800ffd (diff)
parent     57361846b52bc686112da6ca5368d11210796804 (diff)
Merge tag 'v4.19-rc2' into next-general
Sync to Linux 4.19-rc2 for downstream developers.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             |  16
-rw-r--r--  mm/Kconfig.debug       |   6
-rw-r--r--  mm/backing-dev.c       |  15
-rw-r--r--  mm/bootmem.c           | 159
-rw-r--r--  mm/cma.c               |   8
-rw-r--r--  mm/cma_debug.c         |   2
-rw-r--r--  mm/debug.c             |  18
-rw-r--r--  mm/fadvise.c           |   8
-rw-r--r--  mm/gup.c               |   6
-rw-r--r--  mm/hmm.c               |  28
-rw-r--r--  mm/huge_memory.c       |  54
-rw-r--r--  mm/hugetlb.c           |  87
-rw-r--r--  mm/init-mm.c           |  11
-rw-r--r--  mm/internal.h          |  14
-rw-r--r--  mm/kasan/kasan.c       |   5
-rw-r--r--  mm/kasan/kasan_init.c  | 316
-rw-r--r--  mm/khugepaged.c        |  63
-rw-r--r--  mm/ksm.c               |  13
-rw-r--r--  mm/list_lru.c          | 147
-rw-r--r--  mm/madvise.c           |  16
-rw-r--r--  mm/memblock.c          | 255
-rw-r--r--  mm/memcontrol.c        | 599
-rw-r--r--  mm/memfd.c             |   2
-rw-r--r--  mm/memory-failure.c    | 256
-rw-r--r--  mm/memory.c            | 336
-rw-r--r--  mm/memory_hotplug.c    | 112
-rw-r--r--  mm/mempolicy.c         |   6
-rw-r--r--  mm/mempool.c           |  13
-rw-r--r--  mm/migrate.c           |  10
-rw-r--r--  mm/mlock.c             |   3
-rw-r--r--  mm/mm_init.c           |   9
-rw-r--r--  mm/mmap.c              |  80
-rw-r--r--  mm/mmu_notifier.c      |  19
-rw-r--r--  mm/mprotect.c          |  49
-rw-r--r--  mm/nobootmem.c         |  20
-rw-r--r--  mm/nommu.c             |  16
-rw-r--r--  mm/oom_kill.c          | 239
-rw-r--r--  mm/page-writeback.c    |   5
-rw-r--r--  mm/page_alloc.c        | 249
-rw-r--r--  mm/page_ext.c          |   4
-rw-r--r--  mm/page_io.c           |   3
-rw-r--r--  mm/percpu.c            |  29
-rw-r--r--  mm/readahead.c         |  19
-rw-r--r--  mm/rmap.c              |   8
-rw-r--r--  mm/shmem.c             |  69
-rw-r--r--  mm/slab.h              |   6
-rw-r--r--  mm/slab_common.c       |  12
-rw-r--r--  mm/slub.c              |  11
-rw-r--r--  mm/sparse-vmemmap.c    |  61
-rw-r--r--  mm/sparse.c            | 290
-rw-r--r--  mm/swap_slots.c        |  12
-rw-r--r--  mm/swapfile.c          | 270
-rw-r--r--  mm/usercopy.c          |  25
-rw-r--r--  mm/util.c              |   9
-rw-r--r--  mm/vmacache.c          |  38
-rw-r--r--  mm/vmalloc.c           |   4
-rw-r--r--  mm/vmscan.c            | 244
-rw-r--r--  mm/vmstat.c            |   2
-rw-r--r--  mm/workingset.c        |  30
-rw-r--r--  mm/zsmalloc.c          |  36
-rw-r--r--  mm/zswap.c             |   9
61 files changed, 2867 insertions, 1594 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ce95491abd6a..a550635ea5c3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,3 +1,6 @@
+
+menu "Memory Management options"
+
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
@@ -115,10 +118,6 @@ config SPARSEMEM_EXTREME
config SPARSEMEM_VMEMMAP_ENABLE
bool
-config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- def_bool y
- depends on SPARSEMEM && X86_64
-
config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -420,10 +419,11 @@ config ARCH_WANTS_THP_SWAP
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
Swap transparent huge pages in one piece, without splitting.
- XXX: For now this only does clustered swap space allocation.
+ XXX: For now, swap cluster backing transparent huge page
+ will be split after swapout.
For selection by architectures with reasonable THP sizes.
@@ -635,7 +635,7 @@ config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
default n
depends on NO_BOOTMEM
- depends on !FLATMEM
+ depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
help
Ordinarily all struct pages are initialised during early boot in a
@@ -762,3 +762,5 @@ config GUP_BENCHMARK
config ARCH_HAS_PTE_SPECIAL
bool
+
+endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e5e606ee5f71..9a7b8b049d04 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -46,7 +46,8 @@ config PAGE_POISONING
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
reduce the risk of information leaks from freed data. This does
- have a potential performance impact.
+ have a potential performance impact if enabled with the
+ "page_poison=1" kernel boot option.
Note that "poison" here is not the same thing as the "HWPoison"
for CONFIG_MEMORY_FAILURE. This is software poisoning only.
@@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY
say N.
config PAGE_POISONING_ZERO
- bool "Use zero for poisoning instead of random data"
+ bool "Use zero for poisoning instead of debugging value"
depends on PAGE_POISONING
---help---
Instead of using the existing poison value, fill the pages with
@@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO
allocation.
If unsure, say N
- bool
config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2e5d3df0853d..f5981e9d6ae2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -438,10 +438,10 @@ retry:
if (new_congested) {
/* !found and storage for new one already allocated, insert */
congested = new_congested;
- new_congested = NULL;
rb_link_node(&congested->rb_node, parent, node);
rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
- goto found;
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ return congested;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -451,13 +451,13 @@ retry:
if (!new_congested)
return NULL;
- atomic_set(&new_congested->refcnt, 0);
+ refcount_set(&new_congested->refcnt, 1);
new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
found:
- atomic_inc(&congested->refcnt);
+ refcount_inc(&congested->refcnt);
spin_unlock_irqrestore(&cgwb_lock, flags);
kfree(new_congested);
return congested;
@@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
{
unsigned long flags;
- local_irq_save(flags);
- if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
- local_irq_restore(flags);
+ if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
return;
- }
/* bdi might already have been destroyed leaving @congested unlinked */
if (congested->__bdi) {
@@ -804,7 +801,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!bdi->wb_congested)
return -ENOMEM;
- atomic_set(&bdi->wb_congested->refcnt, 1);
+ refcount_set(&bdi->wb_congested->refcnt, 1);
err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
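
The backing-dev hunks above convert the congested refcount from atomic_t to refcount_t and fold the open-coded local_irq_save()/atomic_dec_and_lock() sequence into refcount_dec_and_lock_irqsave(), which takes the lock (with interrupts disabled) only on the final put. A minimal sketch of the resulting put-side pattern; the obj/obj_lock names are illustrative, only the refcount API is from the patch:

	static void obj_put(struct obj *obj)
	{
		unsigned long flags;

		/* returns true only on the 1 -> 0 transition, with
		 * obj_lock held and interrupts disabled */
		if (!refcount_dec_and_lock_irqsave(&obj->refcnt,
						   &obj_lock, &flags))
			return;

		list_del(&obj->node);		/* teardown under the lock */
		spin_unlock_irqrestore(&obj_lock, flags);
		kfree(obj);
	}

Beyond being shorter, refcount_t saturates instead of wrapping on overflow, making refcounting bugs harder to exploit.
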
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9e197987b67d..97db0e8e362b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -21,6 +21,53 @@
#include "internal.h"
+/**
+ * DOC: bootmem overview
+ *
+ * Bootmem is a boot-time physical memory allocator and configurator.
+ *
+ * It is used early in the boot process before the page allocator is
+ * set up.
+ *
+ * Bootmem is based on the most basic of allocators, a First Fit
+ * allocator which uses a bitmap to represent memory. If a bit is 1,
+ * the page is allocated and 0 if unallocated. To satisfy allocations
+ * of sizes smaller than a page, the allocator records the Page Frame
+ * Number (PFN) of the last allocation and the offset the allocation
+ * ended at. Subsequent small allocations are merged together and
+ * stored on the same page.
+ *
+ * The information used by the bootmem allocator is represented by
+ * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES
+ * such structures is statically allocated and then it is discarded
+ * when the system initialization completes. Each entry in this array
+ * corresponds to a node with memory. For UMA systems only entry 0 is
+ * used.
+ *
+ * The bootmem allocator is initialized during early architecture
+ * specific setup. Each architecture is required to supply a
+ * :c:func:`setup_arch` function which, among other tasks, is
+ * responsible for acquiring the necessary parameters to initialise
+ * the boot memory allocator. These parameters define limits of usable
+ * physical memory:
+ *
+ * * @min_low_pfn - the lowest PFN that is available in the system
+ * * @max_low_pfn - the highest PFN that may be addressed by low
+ * memory (%ZONE_NORMAL)
+ * * @max_pfn - the last PFN available to the system.
+ *
+ * After those limits are determined, the :c:func:`init_bootmem` or
+ * :c:func:`init_bootmem_node` function should be called to initialize
+ * the bootmem allocator. The UMA case should use the `init_bootmem`
+ * function. It will initialize ``contig_page_data`` structure that
+ * represents the only memory node in the system. In the NUMA case the
+ * `init_bootmem_node` function should be called to initialize the
+ * bootmem allocator for each node.
+ *
+ * Once the allocator is set up, it is possible to use either single
+ * node or NUMA variant of the allocation APIs.
+ */
+
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data = {
.bdata = &bootmem_node_data[0]
@@ -62,6 +109,8 @@ static unsigned long __init bootmap_bytes(unsigned long pages)
/**
* bootmem_bootmap_pages - calculate bitmap size in pages
* @pages: number of pages the bitmap has to represent
+ *
+ * Return: the number of pages needed to hold the bitmap.
*/
unsigned long __init bootmem_bootmap_pages(unsigned long pages)
{
@@ -121,7 +170,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
* @startpfn: first pfn on the node
* @endpfn: first pfn after the node
*
- * Returns the number of bytes needed to hold the bitmap for this node.
+ * Return: the number of bytes needed to hold the bitmap for this node.
*/
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
unsigned long startpfn, unsigned long endpfn)
@@ -134,7 +183,7 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
* @start: pfn where the bitmap is to be placed
* @pages: number of available physical pages
*
- * Returns the number of bytes needed to hold the bitmap.
+ * Return: the number of bytes needed to hold the bitmap.
*/
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
@@ -143,15 +192,6 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-/*
- * free_bootmem_late - free bootmem pages directly to page allocator
- * @addr: starting physical address of the range
- * @size: size of the range in bytes
- *
- * This is only useful when the bootmem allocator has already been torn
- * down, but we are still initializing the system. Pages are given directly
- * to the page allocator, no bootmem metadata is updated because it is gone.
- */
void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
{
unsigned long cursor, end;
@@ -264,11 +304,6 @@ void __init reset_all_zones_managed_pages(void)
reset_managed_pages_done = 1;
}
-/**
- * free_all_bootmem - release free pages to the buddy allocator
- *
- * Returns the number of pages actually released.
- */
unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
@@ -385,16 +420,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
BUG();
}
-/**
- * free_bootmem_node - mark a page range as usable
- * @pgdat: node the range resides on
- * @physaddr: starting address of the range
- * @size: size of the range in bytes
- *
- * Partial pages will be considered reserved and left as they are.
- *
- * The range must reside completely on the specified node.
- */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
@@ -408,15 +433,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}
-/**
- * free_bootmem - mark a page range as usable
- * @physaddr: starting physical address of the range
- * @size: size of the range in bytes
- *
- * Partial pages will be considered reserved and left as they are.
- *
- * The range must be contiguous but may span node boundaries.
- */
void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
unsigned long start, end;
@@ -439,6 +455,8 @@ void __init free_bootmem(unsigned long physaddr, unsigned long size)
* Partial pages will be reserved.
*
* The range must reside completely on the specified node.
+ *
+ * Return: 0 on success, -errno on failure.
*/
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
@@ -460,6 +478,8 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
* Partial pages will be reserved.
*
* The range must be contiguous but may span node boundaries.
+ *
+ * Return: 0 on success, -errno on failure.
*/
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
@@ -646,19 +666,6 @@ restart:
return NULL;
}
-/**
- * __alloc_bootmem_nopanic - allocate boot memory without panicking
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * Returns NULL on failure.
- */
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
{
@@ -682,19 +689,6 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
return NULL;
}
-/**
- * __alloc_bootmem - allocate boot memory
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
{
@@ -754,21 +748,6 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
return NULL;
}
-/**
- * __alloc_bootmem_node - allocate boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
@@ -807,19 +786,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
}
-/**
- * __alloc_bootmem_low - allocate low boot memory
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
@@ -834,21 +800,6 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size,
ARCH_LOW_ADDRESS_LIMIT);
}
-/**
- * __alloc_bootmem_low_node - allocate low boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- */
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
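
The DOC comment added above describes bootmem as a first-fit allocator over a page bitmap. As a self-contained illustration (user-space C, not the kernel implementation), first fit over such a bitmap looks roughly like this; the kernel additionally records the PFN and offset of the last allocation so sub-page requests can share a page:

	#include <stdint.h>

	#define NPAGES 1024
	static uint8_t bitmap[NPAGES / 8];	/* 1 bit per page, 1 = allocated */

	static int test_bit_(unsigned long i) { return (bitmap[i / 8] >> (i % 8)) & 1; }
	static void set_bit_(unsigned long i) { bitmap[i / 8] |= 1 << (i % 8); }

	/* First fit: take the first run of 'count' clear bits. */
	static long first_fit(unsigned long count)
	{
		unsigned long start, i;

		for (start = 0; start + count <= NPAGES; start++) {
			for (i = 0; i < count; i++)
				if (test_bit_(start + i))
					break;
			if (i == count) {
				for (i = 0; i < count; i++)
					set_bit_(start + i);
				return start;	/* first page of the run */
			}
		}
		return -1;			/* no fit */
	}
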
diff --git a/mm/cma.c b/mm/cma.c
index 5809bbe360d7..4cb76121a3ab 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
* @cma: Contiguous memory region for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
- * @gfp_mask: GFP mask to use during compaction
+ * @no_warn: Avoid printing message about failed allocation
*
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- gfp_t gfp_mask)
+ bool no_warn)
{
unsigned long mask, offset;
unsigned long pfn = -1;
@@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
- gfp_mask);
+ GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
mutex_unlock(&cma_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
@@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
trace_cma_alloc(pfn, page, count, align);
- if (ret && !(gfp_mask & __GFP_NOWARN)) {
+ if (ret && !no_warn) {
pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
cma_debug_show_areas(cma);
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index f23467291cfb..ad6723e9d110 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count)
if (!mem)
return -ENOMEM;
- p = cma_alloc(cma, count, 0, GFP_KERNEL);
+ p = cma_alloc(cma, count, 0, false);
if (!p) {
kfree(mem);
return -ENOMEM;
diff --git a/mm/debug.c b/mm/debug.c
index 56e2d9125ea5..38c926520c97 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -43,12 +43,25 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
+ bool page_poisoned = PagePoisoned(page);
+ int mapcount;
+
+ /*
+ * If struct page is poisoned don't access Page*() functions as that
+ * leads to recursive loop. Page*() check for poisoned pages, and calls
+ * dump_page() when detected.
+ */
+ if (page_poisoned) {
+ pr_emerg("page:%px is uninitialized and poisoned", page);
+ goto hex_only;
+ }
+
/*
* Avoid VM_BUG_ON() in page_mapcount().
* page->_mapcount space in struct page is used by sl[aou]b pages to
* encode own info.
*/
- int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+ mapcount = PageSlab(page) ? 0 : page_mapcount(page);
pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
page, page_ref_count(page), mapcount,
@@ -60,6 +73,7 @@ void __dump_page(struct page *page, const char *reason)
pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+hex_only:
print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
@@ -68,7 +82,7 @@ void __dump_page(struct page *page, const char *reason)
pr_alert("page dumped because: %s\n", reason);
#ifdef CONFIG_MEMCG
- if (page->mem_cgroup)
+ if (!page_poisoned && page->mem_cgroup)
pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
#endif
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index afa41491d324..2d8376e3c640 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -72,8 +72,12 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
goto out;
}
- /* Careful about overflows. Len == 0 means "as much as possible" */
- endbyte = offset + len;
+ /*
+ * Careful about overflows. Len == 0 means "as much as possible". Use
+ * unsigned math because signed overflows are undefined and UBSan
+ * complains.
+ */
+ endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
endbyte = -1;
else
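
The mm/fadvise.c hunk replaces a signed offset + len with unsigned math: signed overflow is undefined behaviour (and trips UBSan), while the unsigned sum wraps predictably and the existing endbyte < len test catches the wrap. The fixed computation, as a standalone sketch:

	#include <stdint.h>

	/* len == 0 means "to the end of the file"; casting to unsigned
	 * keeps offset + len free of signed-overflow UB */
	static uint64_t end_byte(int64_t offset, int64_t len)
	{
		uint64_t endbyte = (uint64_t)offset + (uint64_t)len;

		if (!len || endbyte < (uint64_t)len)
			endbyte = (uint64_t)-1;	/* clamp: whole file */
		else
			endbyte--;		/* inclusive last byte */
		return endbyte;
	}
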
diff --git a/mm/gup.c b/mm/gup.c
index b70d7ba7cc13..1abc8b4afff6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -497,7 +497,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *nonblocking)
{
unsigned int fault_flags = 0;
- int ret;
+ vm_fault_t ret;
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
@@ -818,7 +818,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- int ret, major = 0;
+ vm_fault_t ret, major = 0;
if (unlocked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
@@ -1238,8 +1238,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
int locked = 0;
long ret = 0;
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
for (nstart = start; nstart < end; nstart = nend) {
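
The gup.c hunks belong to a tree-wide series in this merge (also visible below in hmm.c, huge_memory.c, hugetlb.c and internal.h) that retypes fault-path return values from int to vm_fault_t, keeping VM_FAULT_* bit codes apart from -errno values as groundwork for later making vm_fault_t a distinct bitwise type. The caller-side shape, sketched after hmm_vma_do_fault() below:

	vm_fault_t ret;

	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;		/* translate to an errno explicitly */
	if (ret & VM_FAULT_ERROR)
		return -EFAULT;
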
diff --git a/mm/hmm.c b/mm/hmm.c
index de7b6bf77201..c968e49f7a0c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
up_write(&hmm->mirrors_sem);
}
-static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+static int hmm_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct hmm *hmm = mm->hmm;
VM_BUG_ON(!hmm);
atomic_inc(&hmm->sequence);
+
+ return 0;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@ -299,14 +302,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- int r;
+ vm_fault_t ret;
flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
- r = handle_mm_fault(vma, addr, flags);
- if (r & VM_FAULT_RETRY)
+ ret = handle_mm_fault(vma, addr, flags);
+ if (ret & VM_FAULT_RETRY)
return -EBUSY;
- if (r & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
@@ -676,7 +679,8 @@ int hmm_vma_get_pfns(struct hmm_range *range)
return -EINVAL;
/* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma)) {
hmm_pfns_special(range);
return -EINVAL;
}
@@ -849,7 +853,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block)
return -EINVAL;
/* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma)) {
hmm_pfns_special(range);
return -EINVAL;
}
@@ -963,6 +968,8 @@ static void hmm_devmem_free(struct page *page, void *data)
{
struct hmm_devmem *devmem = data;
+ page->mapping = NULL;
+
devmem->ops->free(devmem, page);
}
@@ -971,10 +978,7 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
static void hmm_devmem_radix_release(struct resource *resource)
{
- resource_size_t key, align_start, align_size;
-
- align_start = resource->start & ~(PA_SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
+ resource_size_t key;
mutex_lock(&hmm_devmem_lock);
for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cd7c1a57a14..c3bc7e9c9a2a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -541,18 +541,18 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
- gfp_t gfp)
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
+ struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int ret = 0;
+ vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -584,15 +584,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
- int ret;
+ vm_fault_t ret2;
spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(vmf, VM_UFFD_MISSING);
- VM_BUG_ON(ret & VM_FAULT_FALLBACK);
- return ret;
+ ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
+ return ret2;
}
entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -663,7 +663,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
@@ -682,7 +682,7 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pgtable_t pgtable;
struct page *zero_page;
bool set;
- int ret;
+ vm_fault_t ret;
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -752,7 +752,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
spin_unlock(ptl);
}
-int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, pfn_t pfn, bool write)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -762,11 +762,11 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ !pfn_t_devmap(pfn));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- BUG_ON(!pfn_t_devmap(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
@@ -812,7 +812,7 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
spin_unlock(ptl);
}
-int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, pfn_t pfn, bool write)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -1118,15 +1118,16 @@ unlock:
spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
- struct page *page)
+static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
+ pmd_t orig_pmd, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
- int ret = 0, i;
+ int i;
+ vm_fault_t ret = 0;
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -1142,7 +1143,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_try_charge(pages[i], vma->vm_mm,
+ mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
if (pages[i])
put_page(pages[i]);
@@ -1236,7 +1237,7 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
@@ -1245,7 +1246,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
- int ret = 0;
+ vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1312,7 +1313,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+ if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1328,7 +1329,8 @@ alloc:
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ copy_user_huge_page(new_page, page, vmf->address,
+ vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
mmun_start = haddr;
@@ -1456,7 +1458,7 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
@@ -1740,7 +1742,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -2084,11 +2086,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
+ if (!PageDirty(page) && pmd_dirty(_pmd))
+ set_page_dirty(page);
if (!PageReferenced(page) && pmd_young(_pmd))
SetPageReferenced(page);
page_remove_rmap(page, true);
put_page(page);
- add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
} else if (is_huge_zero_pmd(*pmd)) {
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3612fbb32e9d..3c21775f196b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1479,22 +1479,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
/*
* Dissolve a given free hugepage into free buddy pages. This function does
* nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
- * number of free hugepages would be reduced below the number of reserved
- * hugepages.
+ * dissolution fails because a given page is not a free hugepage, or because
+ * free hugepages are fully reserved.
*/
int dissolve_free_huge_page(struct page *page)
{
- int rc = 0;
+ int rc = -EBUSY;
spin_lock(&hugetlb_lock);
if (PageHuge(page) && !page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
int nid = page_to_nid(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0) {
- rc = -EBUSY;
+ if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
- }
/*
* Move PageHWPoison flag from head page to the raw error page,
* which makes any subpages rather than the error page reusable.
@@ -1508,6 +1506,7 @@ int dissolve_free_huge_page(struct page *page)
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
update_and_free_page(h, head);
+ rc = 0;
}
out:
spin_unlock(&hugetlb_lock);
@@ -2101,7 +2100,7 @@ int __alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = memblock_virt_alloc_try_nid_nopanic(
+ addr = memblock_virt_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
@@ -2119,6 +2118,7 @@ int __alloc_bootmem_huge_page(struct hstate *h)
found:
BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
+ INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
return 1;
@@ -2139,16 +2139,9 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
+ struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
- struct page *page;
-#ifdef CONFIG_HIGHMEM
- page = pfn_to_page(m->phys >> PAGE_SHIFT);
- memblock_free_late(__pa(m),
- sizeof(struct huge_bootmem_page));
-#else
- page = virt_to_page(m);
-#endif
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
WARN_ON(PageReserved(page));
@@ -2163,6 +2156,7 @@ static void __init gather_bootmem_prealloc(void)
*/
if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
+ cond_resched();
}
}
@@ -3166,6 +3160,13 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
return 0;
}
+/*
+ * When a new function is introduced to vm_operations_struct and added
+ * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
+ * This is because under System V memory model, mappings created via
+ * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
+ * their original vm_ops are overwritten with shm_vm_ops.
+ */
const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
@@ -3500,16 +3501,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
struct page *pagecache_page, spinlock_t *ptl)
{
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int ret = 0, outside_reserve = 0;
+ int outside_reserve = 0;
+ vm_fault_t ret = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ unsigned long haddr = address & huge_page_mask(h);
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3519,7 +3522,7 @@ retry_avoidcopy:
* and just make the page writable */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
page_move_anon_rmap(old_page, vma);
- set_huge_ptep_writable(vma, address, ptep);
+ set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
@@ -3543,7 +3546,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
- new_page = alloc_huge_page(vma, address, outside_reserve);
+ new_page = alloc_huge_page(vma, haddr, outside_reserve);
if (IS_ERR(new_page)) {
/*
@@ -3556,11 +3559,10 @@ retry_avoidcopy:
if (outside_reserve) {
put_page(old_page);
BUG_ON(huge_pte_none(pte));
- unmap_ref_private(mm, vma, old_page, address);
+ unmap_ref_private(mm, vma, old_page, haddr);
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h),
- huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -3571,8 +3573,7 @@ retry_avoidcopy:
return 0;
}
- ret = (PTR_ERR(new_page) == -ENOMEM) ?
- VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ ret = vmf_error(PTR_ERR(new_page));
goto out_release_old;
}
@@ -3590,7 +3591,7 @@ retry_avoidcopy:
__SetPageUptodate(new_page);
set_page_huge_active(new_page);
- mmun_start = address & huge_page_mask(h);
+ mmun_start = haddr;
mmun_end = mmun_start + huge_page_size(h);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
@@ -3599,25 +3600,24 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h),
- huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
- huge_ptep_clear_flush(vma, address, ptep);
+ huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
- set_huge_pte_at(mm, address, ptep,
+ set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
- hugepage_add_new_anon_rmap(new_page, vma, address);
+ hugepage_add_new_anon_rmap(new_page, vma, haddr);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
- restore_reserve_on_error(h, vma, address, new_page);
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -3676,12 +3676,13 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return 0;
}
-static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep, unsigned int flags)
+static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping, pgoff_t idx,
+ unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
- int ret = VM_FAULT_SIGBUS;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
struct page *page;
@@ -3744,11 +3745,7 @@ retry:
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -3820,7 +3817,7 @@ retry:
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3872,12 +3869,12 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
}
#endif
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t *ptep, entry;
spinlock_t *ptl;
- int ret;
+ vm_fault_t ret;
u32 hash;
pgoff_t idx;
struct page *page = NULL;
@@ -3974,7 +3971,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, haddr, ptep,
+ ret = hugetlb_cow(mm, vma, address, ptep,
pagecache_page, ptl);
goto out_put_page;
}
@@ -4207,7 +4204,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (absent || is_swap_pte(huge_ptep_get(pte)) ||
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
- int ret;
+ vm_fault_t ret;
unsigned int fault_flags = 0;
if (pte)
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
#define INIT_MM_CONTEXT(name)
#endif
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
+ .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
INIT_MM_CONTEXT(init_mm)
};
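
The comment added to mm/init-mm.c above explains the asymmetry: ordinary mm_structs end in a flexible cpu_bitmap[] sized at allocation time, while the one static init_mm must pin the NR_CPUS worst case through its initializer. A sketch of the dynamic side of that pattern (the obj names are illustrative; mm_struct itself comes from a slab cache sized with cpumask_size()):

	struct obj {
		spinlock_t lock;		/* ...fixed fields... */
		unsigned long cpu_bitmap[];	/* flexible array member */
	};

	static struct obj *obj_alloc(void)
	{
		/* allocate only the bitmap longs this system can use */
		return kzalloc(sizeof(struct obj) + cpumask_size(), GFP_KERNEL);
	}

The [BITS_TO_LONGS(NR_CPUS)] = 0 designated initializer in init_mm forces the static object to cover NR_CPUS bits regardless of the runtime CPU count.
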
diff --git a/mm/internal.h b/mm/internal.h
index 9e3654d70289..87256ae1bef8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,7 +38,7 @@
void page_writeback_init(void);
-int do_swap_page(struct vm_fault *vmf);
+vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter,
return iter + 1;
}
-/*
- * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
- * so all functions starting at paging_init should be marked __init
- * in those cases. SPARSEMEM, however, allows for memory hotplug,
- * and alloc_bootmem_node is not used.
- */
-#ifdef CONFIG_SPARSEMEM
-#define __paginginit __meminit
-#else
-#define __paginginit __init
-#endif
-
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index f185455b3406..c3bd5209da38 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -619,12 +619,13 @@ void kasan_kfree_large(void *ptr, unsigned long ip)
int kasan_module_alloc(void *addr, size_t size)
{
void *ret;
+ size_t scaled_size;
size_t shadow_size;
unsigned long shadow_start;
shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
- shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
- PAGE_SIZE);
+ scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
+ shadow_size = round_up(scaled_size, PAGE_SIZE);
if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
return -EINVAL;
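
The mm/kasan/kasan.c hunk fixes shadow sizing for module areas: KASAN maps every 8 bytes (1 << KASAN_SHADOW_SCALE_SHIFT) of memory to one shadow byte, and the old code shifted before rounding, silently dropping the remainder and potentially allocating one shadow page too few. A standalone check of the arithmetic:

	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	#define SHIFT		3	/* KASAN_SHADOW_SCALE_SHIFT */
	#define MASK		((1UL << SHIFT) - 1)
	#define round_up(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned long size = 8 * PAGE_SIZE + 4;	/* not granule-aligned */

		/* old: shift truncates the 4 spare bytes -> 4096, too small */
		printf("%lu\n", round_up(size >> SHIFT, PAGE_SIZE));
		/* new: round to the granule first -> 8192, covers everything */
		printf("%lu\n", round_up((size + MASK) >> SHIFT, PAGE_SIZE));
		return 0;
	}
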
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index f436246ccc79..7a2a2f13f86f 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -17,10 +17,13 @@
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
+#include "kasan.h"
+
/*
* This page serves two purposes:
* - It used as early shadow memory. The entire shadow region populated
@@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d));
+}
+#else
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return 0;
+}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud));
+}
+#else
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return 0;
+}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd));
+}
+#else
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return 0;
+}
#endif
pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+static inline bool kasan_pte_table(pmd_t pmd)
+{
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte));
+}
+
+static inline bool kasan_zero_page_entry(pte_t pte)
+{
+ return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page));
+}
+
static __init void *early_alloc(size_t size, int node)
{
return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, node);
}
-static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
@@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
}
}
-static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, addr);
@@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
}
if (pmd_none(*pmd)) {
- pmd_populate_kernel(&init_mm, pmd,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pte_t *p;
+
+ if (slab_is_available())
+ p = pte_alloc_one_kernel(&init_mm, addr);
+ else
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, pmd, p);
}
zero_pte_populate(pmd, addr, next);
} while (pmd++, addr = next, addr != end);
+
+ return 0;
}
-static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
+static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
unsigned long end)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
}
if (pud_none(*pud)) {
- pud_populate(&init_mm, pud,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pmd_t *p;
+
+ if (slab_is_available()) {
+ p = pmd_alloc(&init_mm, pud, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pud_populate(&init_mm, pud,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_pmd_populate(pud, addr, next);
} while (pud++, addr = next, addr != end);
+
+ return 0;
}
-static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
+static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
unsigned long end)
{
p4d_t *p4d = p4d_offset(pgd, addr);
@@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
}
if (p4d_none(*p4d)) {
- p4d_populate(&init_mm, p4d,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pud_t *p;
+
+ if (slab_is_available()) {
+ p = pud_alloc(&init_mm, p4d, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ p4d_populate(&init_mm, p4d,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_pud_populate(p4d, addr, next);
} while (p4d++, addr = next, addr != end);
+
+ return 0;
}
/**
@@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
* @shadow_start - start of the memory range to populate
* @shadow_end - end of the memory range to populate
*/
-void __init kasan_populate_zero_shadow(const void *shadow_start,
+int __ref kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end)
{
unsigned long addr = (unsigned long)shadow_start;
@@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- pgd_populate(&init_mm, pgd,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ p4d_t *p;
+
+ if (slab_is_available()) {
+ p = p4d_alloc(&init_mm, pgd, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pgd_populate(&init_mm, pgd,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_p4d_populate(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd)));
+ pmd_clear(pmd);
+}
+
+static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud)));
+ pud_clear(pud);
+}
+
+static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d)));
+ p4d_clear(p4d);
+}
+
+static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd)
+{
+ p4d_t *p4d;
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_start + i;
+ if (!p4d_none(*p4d))
+ return;
+ }
+
+ p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd)));
+ pgd_clear(pgd);
+}
+
+static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (WARN_ON(!kasan_zero_page_entry(*pte)))
+ continue;
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pmd++) {
+ pte_t *pte;
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (kasan_pte_table(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE))
+ pmd_clear(pmd);
+ continue;
+ }
+ pte = pte_offset_kernel(pmd, addr);
+ kasan_remove_pte_table(pte, addr, next);
+ kasan_free_pte(pte_offset_kernel(pmd, 0), pmd);
+ }
+}
+
+static void kasan_remove_pud_table(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pud++) {
+ pmd_t *pmd, *pmd_base;
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (kasan_pmd_table(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE))
+ pud_clear(pud);
+ continue;
+ }
+ pmd = pmd_offset(pud, addr);
+ pmd_base = pmd_offset(pud, 0);
+ kasan_remove_pmd_table(pmd, addr, next);
+ kasan_free_pmd(pmd_base, pud);
+ }
+}
+
+static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, p4d++) {
+ pud_t *pud;
+
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ if (kasan_pud_table(*p4d)) {
+ if (IS_ALIGNED(addr, P4D_SIZE) &&
+ IS_ALIGNED(next, P4D_SIZE))
+ p4d_clear(p4d);
+ continue;
+ }
+ pud = pud_offset(p4d, addr);
+ kasan_remove_pud_table(pud, addr, next);
+ kasan_free_pud(pud_offset(p4d, 0), p4d);
+ }
+}
+
+void kasan_remove_zero_shadow(void *start, unsigned long size)
+{
+ unsigned long addr, end, next;
+ pgd_t *pgd;
+
+ addr = (unsigned long)kasan_mem_to_shadow(start);
+ end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d;
+
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (kasan_p4d_table(*pgd)) {
+ if (IS_ALIGNED(addr, PGDIR_SIZE) &&
+ IS_ALIGNED(next, PGDIR_SIZE))
+ pgd_clear(pgd);
+ continue;
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ kasan_remove_p4d_table(p4d, addr, next);
+ kasan_free_p4d(p4d_offset(pgd, 0), pgd);
+ }
+}
+
+int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ int ret;
+ void *shadow_start, *shadow_end;
+
+ shadow_start = kasan_mem_to_shadow(start);
+ shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return -EINVAL;
+
+ ret = kasan_populate_zero_shadow(shadow_start, shadow_end);
+ if (ret)
+ kasan_remove_zero_shadow(shadow_start,
+ size >> KASAN_SHADOW_SCALE_SHIFT);
+ return ret;
}
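
The large mm/kasan/kasan_init.c addition above makes zero-shadow population callable after boot: the populate helpers gain __ref/slab_is_available() paths so they can use the regular page-table allocators at runtime, and kasan_add_zero_shadow()/kasan_remove_zero_shadow() expose populate and teardown for memory hotplug. A hedged sketch of the intended call shape on the hotplug side (do_online_pages is an illustrative placeholder, not a real kernel function):

	static int hotplug_range(void *start, unsigned long size)
	{
		int ret;

		/* back the new range with the shared zero shadow page;
		 * may allocate page tables, hence fallible */
		ret = kasan_add_zero_shadow(start, size);
		if (ret)
			return ret;

		ret = do_online_pages(start, size);	/* illustrative */
		if (ret)
			kasan_remove_zero_shadow(start, size);
		return ret;
	}
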
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d7b2a4bf8671..a31d740e6cd1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -397,6 +397,26 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ if (shmem_file(vma->vm_file)) {
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return false;
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR);
+ }
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ return !(vm_flags & VM_NO_KHUGEPAGED);
+}
+
int __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
@@ -434,15 +454,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags)
{
unsigned long hstart, hend;
- if (!vma->anon_vma)
- /*
- * Not yet faulted in so we will register later in the
- * page fault if needed.
- */
- return 0;
- if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED))
- /* khugepaged not yet working on file or special mappings */
+
+ /*
+ * khugepaged does not yet work on non-shmem files or special
+ * mappings. And file-private shmem THP is not supported.
+ */
+ if (!hugepage_vma_check(vma, vm_flags))
return 0;
+
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -819,25 +838,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
}
#endif
-static bool hugepage_vma_check(struct vm_area_struct *vma)
-{
- if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
- if (shmem_file(vma->vm_file)) {
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return false;
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
- }
- if (!vma->anon_vma || vma->vm_ops)
- return false;
- if (is_vma_temporary_stack(vma))
- return false;
- return !(vma->vm_flags & VM_NO_KHUGEPAGED);
-}
-
/*
* If mmap_sem temporarily dropped, revalidate vma
* before taking mmap_sem.
@@ -862,7 +862,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma))
+ if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
return 0;
}
@@ -880,7 +880,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- int swapped_in = 0, ret = 0;
+ int swapped_in = 0;
+ vm_fault_t ret = 0;
struct vm_fault vmf = {
.vma = vma,
.address = address,
@@ -1517,6 +1518,8 @@ tree_unlocked:
unlock_page(new_page);
*hpage = NULL;
+
+ khugepaged_pages_collapsed++;
} else {
/* Something went wrong: rollback changes to the radix-tree */
shmem_uncharge(mapping->host, nr_none);
@@ -1694,7 +1697,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
- if (!hugepage_vma_check(vma)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags)) {
skip:
progress++;
continue;
diff --git a/mm/ksm.c b/mm/ksm.c
index a6d43cf9a982..5b0894b45ee5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -470,7 +470,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
do {
cond_resched();
@@ -652,9 +652,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* list_head to stay clear from the rb_parent_color union
* (aligned and different than any node) and also different
* from &migrate_nodes. This will verify that future list.h changes
- * don't break STABLE_NODE_DUP_HEAD.
+ * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
*/
-#if GCC_VERSION >= 40903 /* only recent gcc can handle it */
+#if defined(GCC_VERSION) && GCC_VERSION >= 40903
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif
@@ -703,7 +703,7 @@ again:
* We cannot do anything with the page while its refcount is 0.
* Usually 0 means free, or tail of a higher-order page: in which
* case this node is no longer referenced, and should be freed;
- * however, it might mean that the page is under page_freeze_refs().
+ * however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
* but if page is swapcache in migrate_page_move_mapping(), it might
* still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ again:
* work here too. We have chosen the !PageSwapCache test to
* optimize the common case, when the page is or is about to
* be freed: PageSwapCache is cleared (under spin_lock_irq)
- * in the freeze_refs section of __remove_mapping(); but Anon
+ * in the ref_freeze section of __remove_mapping(); but Anon
* page->mapping reset to NULL later, in free_pages_prepare().
*/
if (!PageSwapCache(page))
@@ -2430,6 +2430,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
VM_HUGETLB | VM_MIXEDMAP))
return 0; /* just ignore the advice */
+ if (vma_is_dax(vma))
+ return 0;
+
#ifdef VM_SAO
if (*vm_flags & VM_SAO)
return 0;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fcfb6c89ed47..5b30625fd365 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -12,7 +12,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -29,17 +29,12 @@ static void list_lru_unregister(struct list_lru *lru)
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
}
-#else
-static void list_lru_register(struct list_lru *lru)
-{
-}
-static void list_lru_unregister(struct list_lru *lru)
+static int lru_shrinker_id(struct list_lru *lru)
{
+ return lru->shrinker_id;
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
/*
@@ -75,20 +70,39 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
{
- struct mem_cgroup *memcg;
+ struct list_lru_one *l = &nlru->lru;
+ struct mem_cgroup *memcg = NULL;
if (!nlru->memcg_lrus)
- return &nlru->lru;
+ goto out;
memcg = mem_cgroup_from_kmem(ptr);
if (!memcg)
- return &nlru->lru;
+ goto out;
- return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+out:
+ if (memcg_ptr)
+ *memcg_ptr = memcg;
+ return l;
}
#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+
+static int lru_shrinker_id(struct list_lru *lru)
+{
+ return -1;
+}
+
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return false;
@@ -101,23 +115,30 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
{
+ if (memcg_ptr)
+ *memcg_ptr = NULL;
return &nlru->lru;
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct mem_cgroup *memcg;
struct list_lru_one *l;
spin_lock(&nlru->lock);
if (list_empty(item)) {
- l = list_lru_from_kmem(nlru, item);
+ l = list_lru_from_kmem(nlru, item, &memcg);
list_add_tail(item, &l->list);
- l->nr_items++;
+ /* Set shrinker bit if the first element was added */
+ if (!l->nr_items++)
+ memcg_set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
@@ -135,7 +156,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
spin_lock(&nlru->lock);
if (!list_empty(item)) {
- l = list_lru_from_kmem(nlru, item);
+ l = list_lru_from_kmem(nlru, item, NULL);
list_del_init(item);
l->nr_items--;
nlru->nr_items--;
@@ -162,26 +183,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
-static unsigned long __list_lru_count_one(struct list_lru *lru,
- int nid, int memcg_idx)
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
rcu_read_lock();
- l = list_lru_from_memcg_idx(nlru, memcg_idx);
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = l->nr_items;
rcu_read_unlock();
return count;
}
-
-unsigned long list_lru_count_one(struct list_lru *lru,
- int nid, struct mem_cgroup *memcg)
-{
- return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
-}
EXPORT_SYMBOL_GPL(list_lru_count_one);
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
@@ -194,17 +209,15 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
- spin_lock(&nlru->lock);
l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
list_for_each_safe(item, n, &l->list) {
@@ -250,8 +263,6 @@ restart:
BUG();
}
}
-
- spin_unlock(&nlru->lock);
return isolated;
}
@@ -260,11 +271,32 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
- isolate, cb_arg, nr_to_walk);
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+ return ret;
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);
+unsigned long
+list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock_irq(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock_irq(&nlru->lock);
+ return ret;
+}
+
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
@@ -272,12 +304,18 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
long isolated = 0;
int memcg_idx;
- isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
- nr_to_walk);
+ isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
+ nr_to_walk);
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
for_each_memcg_cache_index(memcg_idx) {
- isolated += __list_lru_walk_one(lru, nid, memcg_idx,
- isolate, cb_arg, nr_to_walk);
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+
if (*nr_to_walk <= 0)
break;
}
@@ -292,7 +330,7 @@ static void init_one_lru(struct list_lru_one *l)
l->nr_items = 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
@@ -500,10 +538,13 @@ fail:
goto out;
}
-static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
- int src_idx, int dst_idx)
+static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
+ int src_idx, struct mem_cgroup *dst_memcg)
{
+ struct list_lru_node *nlru = &lru->node[nid];
+ int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
+ bool set;
/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
@@ -515,14 +556,17 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
dst = list_lru_from_memcg_idx(nlru, dst_idx);
list_splice_init(&src->list, &dst->list);
+ set = (!dst->nr_items && src->nr_items);
dst->nr_items += src->nr_items;
+ if (set)
+ memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
spin_unlock_irq(&nlru->lock);
}
static void memcg_drain_list_lru(struct list_lru *lru,
- int src_idx, int dst_idx)
+ int src_idx, struct mem_cgroup *dst_memcg)
{
int i;
@@ -530,16 +574,16 @@ static void memcg_drain_list_lru(struct list_lru *lru,
return;
for_each_node(i)
- memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+ memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}
-void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
{
struct list_lru *lru;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list)
- memcg_drain_list_lru(lru, src_idx, dst_idx);
+ memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}
#else
@@ -551,15 +595,21 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key)
+ struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ if (shrinker)
+ lru->shrinker_id = shrinker->id;
+ else
+ lru->shrinker_id = -1;
+#endif
memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
@@ -602,6 +652,9 @@ void list_lru_destroy(struct list_lru *lru)
kfree(lru->node);
lru->node = NULL;
+#ifdef CONFIG_MEMCG_KMEM
+ lru->shrinker_id = -1;
+#endif
memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
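
The recurring pattern in this file is the new shrinker bit: list_lru_add() and memcg_drain_list_lru_node() call memcg_set_shrinker_bit() only on the empty-to-non-empty transition, so shrink_slab() can later skip LRUs that were never populated. A toy, userspace-only model of that handshake (all names here are invented for the sketch):

#include <stdio.h>

#define NR_SHRINKERS 64

struct toy_lru {
	int shrinker_id;
	unsigned long nr_items;
};

static unsigned long shrinker_bitmap;	/* one bit per registered shrinker */

/* Mirrors list_lru_add(): flag the shrinker only on the 0 -> 1 transition. */
static void toy_lru_add(struct toy_lru *lru)
{
	if (!lru->nr_items++ && lru->shrinker_id >= 0)
		shrinker_bitmap |= 1UL << lru->shrinker_id;
}

/* Mirrors shrink_slab(): only visit shrinkers whose bit is set. */
static void toy_shrink_all(void)
{
	for (int id = 0; id < NR_SHRINKERS; id++)
		if (shrinker_bitmap & (1UL << id))
			printf("would call shrinker %d\n", id);
}

int main(void)
{
	struct toy_lru lru = { .shrinker_id = 3 };

	toy_lru_add(&lru);	/* first item: sets bit 3 */
	toy_lru_add(&lru);	/* later items: bit already set, no extra work */
	toy_shrink_all();
	return 0;
}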
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d3c922ea1a1..972a9eaa898b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -631,11 +631,13 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += PAGE_SIZE << order) {
+ unsigned long pfn;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
if (ret != 1)
return ret;
+ pfn = page_to_pfn(page);
/*
* When soft offlining hugepages, after migrating the page
@@ -651,17 +653,25 @@ static int madvise_inject_error(int behavior,
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- page_to_pfn(page), start);
+ pfn, start);
ret = soft_offline_page(page, MF_COUNT_INCREASED);
if (ret)
return ret;
continue;
}
+
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- page_to_pfn(page), start);
+ pfn, start);
- ret = memory_failure(page_to_pfn(page), MF_COUNT_INCREASED);
+ /*
+ * Drop the page reference taken by get_user_pages_fast(). In
+ * the absence of MF_COUNT_INCREASED the memory_failure()
+ * routine is responsible for pinning the page to prevent it
+ * from being released back to the page allocator.
+ */
+ put_page(page);
+ ret = memory_failure(pfn, 0);
if (ret)
return ret;
}
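
With this hunk, madvise_inject_error() drops the get_user_pages_fast() reference itself and calls memory_failure() without MF_COUNT_INCREASED, leaving the pinning to memory_failure(). The entry point is still plain madvise(); a minimal sketch of triggering it (needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE, really poisons the page, and assumes your libc headers define MADV_HWPOISON):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;	/* make sure the page is actually populated */

	/* Routed to madvise_inject_error(); poisons the backing page. */
	if (madvise(p, page, MADV_HWPOISON))
		perror("MADV_HWPOISON");	/* EPERM without CAP_SYS_ADMIN */
	return 0;
}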
diff --git a/mm/memblock.c b/mm/memblock.c
index 03d48d8835ba..237944479d25 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,12 +20,68 @@
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/sections.h>
#include <linux/io.h>
#include "internal.h"
+/**
+ * DOC: memblock overview
+ *
+ * Memblock is a method of managing memory regions during the early
+ * boot period when the usual kernel memory allocators are not up and
+ * running.
+ *
+ * Memblock views the system memory as collections of contiguous
+ * regions. There are several types of these collections:
+ *
+ * * ``memory`` - describes the physical memory available to the
+ * kernel; this may differ from the actual physical memory installed
+ * in the system, for instance when the memory is restricted with
+ * ``mem=`` command line parameter
+ * * ``reserved`` - describes the regions that were allocated
+ * * ``physmap`` - describes the actual physical memory regardless of
+ * the possible restrictions; the ``physmap`` type is only available
+ * on some architectures.
+ *
+ * Each region is represented by :c:type:`struct memblock_region` that
+ * defines the region extents, its attributes and NUMA node id on NUMA
+ * systems. Every memory type is described by the :c:type:`struct
+ * memblock_type` which contains an array of memory regions along with
+ * the allocator metadata. The memory types are nicely wrapped with
+ * :c:type:`struct memblock`. This structure is statically initialized
+ * at build time. The region arrays for the "memory" and "reserved"
+ * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
+ * "physmap" type to %INIT_PHYSMEM_REGIONS.
+ * The :c:func:`memblock_allow_resize` enables automatic resizing of
+ * the region arrays during addition of new regions. This feature
+ * should be used with care so that memory allocated for the region
+ * array will not overlap with areas that should be reserved, for
+ * example initrd.
+ *
+ * The early architecture setup should tell memblock what the physical
+ * memory layout is by using :c:func:`memblock_add` or
+ * :c:func:`memblock_add_node` functions. The first function does not
+ * assign the region to a NUMA node and it is appropriate for UMA
+ * systems. Yet, it is possible to use it on NUMA systems as well and
+ * assign the region to a NUMA node later in the setup process using
+ * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node`
+ * performs such an assignment directly.
+ *
+ * Once memblock is set up, the memory can be allocated using either
+ * memblock or bootmem APIs.
+ *
+ * As the system boot progresses, the architecture specific
+ * :c:func:`mem_init` function frees all the memory to the buddy page
+ * allocator.
+ *
+ * If an architecture enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * memblock data structures will be discarded after the system
+ * initialization completes.
+ */
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
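
To make the new overview concrete, here is an illustrative, kernel-style sketch of the sequence it describes: register RAM, protect the kernel image, and allocate before the buddy allocator exists. The function name and the addresses are invented for the example; this is a sketch of the API usage, not a standalone program.

/*
 * Illustrative only: the shape of an early, setup_arch()-era sequence
 * using the interfaces named in the DOC comment above.
 */
void __init example_early_mm_setup(void)
{
	phys_addr_t pa;

	/* Tell memblock about the RAM reported by firmware. */
	memblock_add(0x80000000UL, SZ_512M);

	/* Keep the kernel image out of the allocatable pool. */
	memblock_reserve(__pa(_text), _end - _text);

	/* Early physical allocation, before the buddy allocator is up. */
	pa = memblock_alloc(SZ_64K, SMP_CACHE_BYTES);
	if (!pa)
		panic("example: early allocation failed");
}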
@@ -60,7 +116,7 @@ static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
-ulong __init_memblock choose_memblock_flags(void)
+enum memblock_flags __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -92,10 +148,11 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
return i < type->cnt;
}
-/*
+/**
* __memblock_find_range_bottom_up - find free area utility in bottom-up
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -103,13 +160,13 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
- ulong flags)
+ enum memblock_flags flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
@@ -129,7 +186,8 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
/**
* __memblock_find_range_top_down - find free area utility, in top-down
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -137,13 +195,13 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid,
- ulong flags)
+ enum memblock_flags flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
@@ -169,7 +227,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @size: size of free area to find
* @align: alignment of free area to find
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @flags: pick from blocks based on memory attributes
*
@@ -183,12 +242,13 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*
* If bottom-up allocation failed, will try to allocate memory top-down.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid, ulong flags)
+ phys_addr_t end, int nid,
+ enum memblock_flags flags)
{
phys_addr_t kernel_end, ret;
@@ -227,7 +287,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
- WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
+ WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
+ "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid,
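
The bottom-up and top-down helpers differ only in scan direction and in whether a candidate is rounded up from a region's start or down from its end. A self-contained toy model of the two search orders (power-of-two alignment assumed; region ends are treated as exclusive limits here):

#include <stdio.h>

struct range { unsigned long start, end; };

static unsigned long find_bottom_up(struct range *free, int n,
				    unsigned long size, unsigned long align)
{
	for (int i = 0; i < n; i++) {
		unsigned long cand = (free[i].start + align - 1) & ~(align - 1);

		if (cand + size <= free[i].end)
			return cand;	/* lowest suitable address wins */
	}
	return 0;
}

static unsigned long find_top_down(struct range *free, int n,
				   unsigned long size, unsigned long align)
{
	for (int i = n - 1; i >= 0; i--) {
		if (free[i].end < size)
			continue;
		unsigned long cand = (free[i].end - size) & ~(align - 1);

		if (cand >= free[i].start)
			return cand;	/* highest suitable address wins */
	}
	return 0;
}

int main(void)
{
	struct range free[] = { { 0x1000, 0x8000 }, { 0x20000, 0x40000 } };

	printf("bottom-up: %#lx\n", find_bottom_up(free, 2, 0x2000, 0x1000));
	printf("top-down:  %#lx\n", find_top_down(free, 2, 0x2000, 0x1000));
	return 0;
}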
@@ -237,13 +298,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
/**
* memblock_find_in_range - find free area in given range
* @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
* @size: size of free area to find
* @align: alignment of free area to find
*
* Find @size free area aligned to @align in the specified range.
*
- * RETURNS:
+ * Return:
* Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
@@ -251,7 +313,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t align)
{
phys_addr_t ret;
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
again:
ret = memblock_find_in_range_node(size, align, start, end,
@@ -287,7 +349,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
/**
- * Discard memory and reserved arrays if they were allocated
+ * memblock_discard - discard memory and reserved arrays if they were allocated
*/
void __init memblock_discard(void)
{
@@ -317,11 +379,11 @@ void __init memblock_discard(void)
*
* Double the size of the @type regions array. If memblock is being used to
* allocate memory for a new reserved regions array and there is a previously
- * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * allocated memory range [@new_area_start, @new_area_start + @new_area_size]
* waiting to be reserved, ensure the memory used by the new array does
* not overlap.
*
- * RETURNS:
+ * Return:
* 0 on success, -1 on failure.
*/
static int __init_memblock memblock_double_array(struct memblock_type *type,
@@ -330,7 +392,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
{
struct memblock_region *new_array, *old_array;
phys_addr_t old_alloc_size, new_alloc_size;
- phys_addr_t old_size, new_size, addr;
+ phys_addr_t old_size, new_size, addr, new_end;
int use_slab = slab_is_available();
int *in_slab;
@@ -391,9 +453,9 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
return -1;
}
- memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
- type->name, type->max * 2, (u64)addr,
- (u64)addr + new_size - 1);
+ new_end = addr + new_size - 1;
+ memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
+ type->name, type->max * 2, &addr, &new_end);
/*
* Found space, we now need to move the array over before we add the
@@ -466,13 +528,14 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
* @nid: node id of the new region
* @flags: flags of the new region
*
- * Insert new memblock region [@base,@base+@size) into @type at @idx.
+ * Insert new memblock region [@base, @base + @size) into @type at @idx.
* @type must already have extra room to accommodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
phys_addr_t size,
- int nid, unsigned long flags)
+ int nid,
+ enum memblock_flags flags)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -494,17 +557,17 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* @nid: nid of the new region
* @flags: flags of the new region
*
- * Add new memblock region [@base,@base+@size) into @type. The new region
+ * Add new memblock region [@base, @base + @size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
* existing regions. @type is guaranteed to be minimal (all neighbouring
* compatible regions are merged) after the addition.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
- int nid, unsigned long flags)
+ int nid, enum memblock_flags flags)
{
bool insert = false;
phys_addr_t obase = base;
@@ -588,12 +651,35 @@ repeat:
}
}
+/**
+ * memblock_add_node - add new memblock region within a NUMA node
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ *
+ * Add new memblock region [@base, @base + @size) to the "memory"
+ * type. See memblock_add_range() description for more details.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
+/**
+ * memblock_add - add new memblock region
+ * @base: base address of the new region
+ * @size: size of the new region
+ *
+ * Add new memblock region [@base, @base + @size) to the "memory"
+ * type. See memblock_add_range() description for more details.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
@@ -613,11 +699,11 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
* @end_rgn: out parameter for the end of isolated region
*
* Walk @type and ensure that regions don't cross the boundaries defined by
- * [@base,@base+@size). Crossing regions are split at the boundaries,
+ * [@base, @base + @size). Crossing regions are split at the boundaries,
* which may create at most two more regions. The index of the first
* region inside the range is returned in *@start_rgn and end in *@end_rgn.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
static int __init_memblock memblock_isolate_range(struct memblock_type *type,
@@ -728,10 +814,15 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
}
/**
+ * memblock_setclr_flag - set or clear flag for a memory region
+ * @base: base address of the region
+ * @size: size of the region
+ * @set: set or clear the flag
+ * @flag: the flag to update
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
phys_addr_t size, int set, int flag)
@@ -758,7 +849,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -770,7 +861,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -782,7 +873,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
@@ -796,7 +887,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
{
@@ -808,7 +899,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on success, -errno on failure.
+ * Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
{
@@ -873,7 +964,8 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
+void __init_memblock __next_mem_range(u64 *idx, int nid,
+ enum memblock_flags flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -968,9 +1060,6 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
/**
* __next_mem_range_rev - generic next function for for_each_*_range_rev()
*
- * Finds the next range from type_a which is not marked as unsuitable
- * in type_b.
- *
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
@@ -980,9 +1069,13 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
* Reverse of __next_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+ enum memblock_flags flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -1114,10 +1207,10 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
* @type: memblock type to set node ID for
* @nid: node ID to set
*
- * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base, @base + @size) to @nid.
* Regions which cross the area boundaries are split as necessary.
*
- * RETURNS:
+ * Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
@@ -1140,7 +1233,8 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid, ulong flags)
+ phys_addr_t end, int nid,
+ enum memblock_flags flags)
{
phys_addr_t found;
@@ -1162,7 +1256,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
- ulong flags)
+ enum memblock_flags flags)
{
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
flags);
@@ -1170,14 +1264,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
- int nid, ulong flags)
+ int nid, enum memblock_flags flags)
{
return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
phys_addr_t ret;
again:
@@ -1224,6 +1318,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
+#if defined(CONFIG_NO_BOOTMEM)
/**
* memblock_virt_alloc_internal - allocate boot memory block
* @size: size of memory block to be allocated in bytes
@@ -1240,7 +1335,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* The allocation is performed from memory region limited by
* memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
*
- * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
+ * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0.
*
* The phys address of allocated boot memory block is converted to virtual and
* allocated memory is reset to 0.
@@ -1248,7 +1343,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* In addition, function sets the min_count to 0 using kmemleak_alloc for
* allocated boot memory block, so that it is never reported as leaks.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
static void * __init memblock_virt_alloc_internal(
@@ -1258,7 +1353,7 @@ static void * __init memblock_virt_alloc_internal(
{
phys_addr_t alloc;
void *ptr;
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
@@ -1333,7 +1428,7 @@ done:
* info), if enabled. Does not zero allocated memory, does not panic if request
* cannot be satisfied.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_virt_alloc_try_nid_raw(
@@ -1343,9 +1438,9 @@ void * __init memblock_virt_alloc_try_nid_raw(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
@@ -1370,7 +1465,7 @@ void * __init memblock_virt_alloc_try_nid_raw(
* Public function, provides additional debug information (including caller
* info), if enabled. This function zeroes the allocated memory.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_virt_alloc_try_nid_nopanic(
@@ -1380,9 +1475,9 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
@@ -1406,7 +1501,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
* which provides debug information (including caller info), if enabled,
* and panics if the request can not be satisfied.
*
- * RETURNS:
+ * Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_virt_alloc_try_nid(
@@ -1416,9 +1511,9 @@ void * __init memblock_virt_alloc_try_nid(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
if (ptr) {
@@ -1426,11 +1521,11 @@ void * __init memblock_virt_alloc_try_nid(
return ptr;
}
- panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr);
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr);
return NULL;
}
+#endif
/**
* __memblock_free_early - free boot memory block
@@ -1442,16 +1537,17 @@ void * __init memblock_virt_alloc_try_nid(
*/
void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pF\n",
+ __func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
memblock_remove_range(&memblock.reserved, base, size);
}
-/*
+/**
* __memblock_free_late - free bootmem block pages directly to buddy allocator
- * @addr: phys starting address of the boot memory block
+ * @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
* This is only useful when the bootmem allocator has already been torn
@@ -1460,11 +1556,11 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
{
- u64 cursor, end;
+ phys_addr_t cursor, end;
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
+ end = base + size - 1;
+ memblock_dbg("%s: [%pa-%pa] %pF\n",
+ __func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
cursor = PFN_UP(base);
end = PFN_DOWN(base + size);
@@ -1663,9 +1759,9 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
* @base: base of region to check
* @size: size of region to check
*
- * Check if the region [@base, @base+@size) is a subset of a memory block.
+ * Check if the region [@base, @base + @size) is a subset of a memory block.
*
- * RETURNS:
+ * Return:
* 0 if false, non-zero if true
*/
bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
@@ -1684,9 +1780,10 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz
* @base: base of region to check
* @size: size of region to check
*
- * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ * Check if the region [@base, @base + @size) intersects a reserved
+ * memory block.
*
- * RETURNS:
+ * Return:
* True if they intersect, false if not.
*/
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
@@ -1733,7 +1830,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
static void __init_memblock memblock_dump(struct memblock_type *type)
{
phys_addr_t base, end, size;
- unsigned long flags;
+ enum memblock_flags flags;
int idx;
struct memblock_region *rgn;
@@ -1751,7 +1848,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
+ pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n",
type->name, idx, &base, &end, &size, nid_buf, flags);
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5ef320a..4ead5a4817de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -233,6 +233,21 @@ enum res_type {
/* Used for OOM notifier */
#define OOM_CONTROL (0)
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -246,12 +261,7 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
-{
- return (memcg == root_mem_cgroup);
-}
-
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -305,7 +315,135 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
-#endif /* !CONFIG_SLOB */
+static int memcg_shrinker_map_size;
+static DEFINE_MUTEX(memcg_shrinker_map_mutex);
+
+static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+}
+
+static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
+ int size, int old_size)
+{
+ struct memcg_shrinker_map *new, *old;
+ int nid;
+
+ lockdep_assert_held(&memcg_shrinker_map_mutex);
+
+ for_each_node(nid) {
+ old = rcu_dereference_protected(
+ mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* Set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_size);
+ memset((void *)new->map + old_size, 0, size - old_size);
+
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+ call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
+ }
+
+ return 0;
+}
+
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct memcg_shrinker_map *map;
+ int nid;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ for_each_node(nid) {
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ map = rcu_dereference_protected(pn->shrinker_map, true);
+ if (map)
+ kvfree(map);
+ rcu_assign_pointer(pn->shrinker_map, NULL);
+ }
+}
+
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct memcg_shrinker_map *map;
+ int nid, size, ret = 0;
+
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ size = memcg_shrinker_map_size;
+ for_each_node(nid) {
+ map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+ if (!map) {
+ memcg_free_shrinker_maps(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+ }
+ mutex_unlock(&memcg_shrinker_map_mutex);
+
+ return ret;
+}
+
+int memcg_expand_shrinker_maps(int new_id)
+{
+ int size, old_size, ret = 0;
+ struct mem_cgroup *memcg;
+
+ size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
+ old_size = memcg_shrinker_map_size;
+ if (size <= old_size)
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ if (!root_mem_cgroup)
+ goto unlock;
+
+ for_each_mem_cgroup(memcg) {
+ if (mem_cgroup_is_root(memcg))
+ continue;
+ ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
+ if (ret)
+ goto unlock;
+ }
+unlock:
+ if (!ret)
+ memcg_shrinker_map_size = size;
+ mutex_unlock(&memcg_shrinker_map_mutex);
+ return ret;
+}
+
+void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct memcg_shrinker_map *map;
+
+ rcu_read_lock();
+ map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, map->map);
+ rcu_read_unlock();
+ }
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
+#endif /* CONFIG_MEMCG_KMEM */
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
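
Note what memcg_expand_one_shrinker_map() does with the old contents: rather than copying, it sets every pre-existing bit to 1. That is safe because a spuriously set bit only costs one extra shrinker invocation, while losing a concurrent set_bit() could leave objects unreclaimed. A userspace toy model of that choice (the kernel frees the old map via call_rcu(), which the sketch flattens to a plain free()):

#include <stdlib.h>
#include <string.h>

/* Toy version of a per-memcg shrinker bitmap being grown. */
struct toy_map {
	size_t size;		/* payload size in bytes */
	unsigned char bits[];
};

static struct toy_map *toy_expand(struct toy_map *old, size_t new_size)
{
	struct toy_map *new = malloc(sizeof(*new) + new_size);

	if (!new)
		return NULL;
	new->size = new_size;
	/* Like the kernel code: set all old bits, clear all new bits. */
	memset(new->bits, 0xff, old->size);
	memset(new->bits + old->size, 0, new_size - old->size);
	free(old);	/* the kernel defers this through call_rcu() */
	return new;
}

int main(void)
{
	struct toy_map *map = malloc(sizeof(*map) + 4);

	if (!map)
		return 1;
	map->size = 4;
	memset(map->bits, 0, 4);
	map = toy_expand(map, 8);
	return map ? 0 : 1;
}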
@@ -678,9 +816,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+/**
+ * get_mem_cgroup_from_mm - Obtain a reference on given mm_struct's memcg.
+ * @mm: mm from which memcg should be extracted. It can be NULL.
+ *
+ * Obtain a reference on mm->memcg and return it if successful. Otherwise,
+ * root_mem_cgroup is returned. However, if mem_cgroup is disabled, NULL is
+ * returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- struct mem_cgroup *memcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return NULL;
rcu_read_lock();
do {
@@ -700,6 +849,46 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
rcu_read_unlock();
return memcg;
}
+EXPORT_SYMBOL(get_mem_cgroup_from_mm);
+
+/**
+ * get_mem_cgroup_from_page - Obtain a reference on given page's memcg.
+ * @page: page from which memcg should be extracted.
+ *
+ * Obtain a reference on page->memcg and return it if successful. Otherwise,
+ * root_mem_cgroup is returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ if (!memcg || !css_tryget_online(&memcg->css))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+ return memcg;
+}
+EXPORT_SYMBOL(get_mem_cgroup_from_page);
+
+/**
+ * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (unlikely(current->active_memcg)) {
+ struct mem_cgroup *memcg = root_mem_cgroup;
+
+ rcu_read_lock();
+ if (css_tryget_online(&current->active_memcg->css))
+ memcg = current->active_memcg;
+ rcu_read_unlock();
+ return memcg;
+ }
+ return get_mem_cgroup_from_mm(current->mm);
+}
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
@@ -850,7 +1039,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
int nid;
int i;
- while ((memcg = parent_mem_cgroup(memcg))) {
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(memcg, nid);
for (i = 0; i <= DEF_PRIORITY; i++) {
@@ -862,21 +1051,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
}
}
-/*
- * Iteration constructs for visiting all cgroups (under a tree). If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root) \
- for (iter = mem_cgroup_iter(root, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter) \
- for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(NULL, iter, NULL))
-
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
@@ -1483,28 +1657,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+enum oom_status {
+ OOM_SUCCESS,
+ OOM_FAILED,
+ OOM_ASYNC,
+ OOM_SKIPPED
+};
+
+static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
- return;
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ return OOM_SKIPPED;
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
* that holds all kinds of filesystem and mm locks.
*
- * Also, the caller may handle a failed allocation gracefully
- * (like optional page cache readahead) and so an OOM killer
- * invocation might not even be necessary.
+ * cgroup1 allows disabling the OOM killer and waiting for outside
+ * handling until the charge can succeed; remember the context and put
+ * the task to sleep at the end of the page fault when all locks are
+ * released.
*
- * That's why we don't do anything here except remember the
- * OOM context and then deal with it at the end of the page
- * fault when the stack is unwound, the locks are released,
- * and when we know whether the fault was overall successful.
+ * On the other hand, the in-kernel OOM killer allows for asynchronous
+ * victim memory reclaim (oom_reaper), which means we are not solely
+ * relying on the OOM victim to make forward progress, and we can
+ * invoke the OOM killer here.
+ *
+ * Please note that mem_cgroup_out_of_memory might fail to find a
+ * victim and then we have to bail out from the charge path.
*/
- css_get(&memcg->css);
- current->memcg_in_oom = memcg;
- current->memcg_oom_gfp_mask = mask;
- current->memcg_oom_order = order;
+ if (memcg->oom_kill_disable) {
+ if (!current->in_user_fault)
+ return OOM_SKIPPED;
+ css_get(&memcg->css);
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
+
+ return OOM_ASYNC;
+ }
+
+ if (mem_cgroup_out_of_memory(memcg, mask, order))
+ return OOM_SUCCESS;
+
+ WARN(1, "Memory cgroup charge failed because of no reclaimable memory! "
+ "This looks like a misconfiguration or a kernel bug.");
+ return OOM_FAILED;
}
/**
@@ -1578,6 +1777,62 @@ cleanup:
}
/**
+ * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
+ * @victim: task to be killed by the OOM killer
+ * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
+ *
+ * Returns a pointer to a memory cgroup, which has to be cleaned up
+ * by killing all belonging OOM-killable tasks.
+ *
+ * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
+ */
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+ struct mem_cgroup *oom_domain)
+{
+ struct mem_cgroup *oom_group = NULL;
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return NULL;
+
+ if (!oom_domain)
+ oom_domain = root_mem_cgroup;
+
+ rcu_read_lock();
+
+ memcg = mem_cgroup_from_task(victim);
+ if (memcg == root_mem_cgroup)
+ goto out;
+
+ /*
+ * Traverse the memory cgroup hierarchy from the victim task's
+ * cgroup up to the OOMing cgroup (or root) to find the
+ * highest-level memory cgroup with oom.group set.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ if (memcg->oom_group)
+ oom_group = memcg;
+
+ if (memcg == oom_domain)
+ break;
+ }
+
+ if (oom_group)
+ css_get(&oom_group->css);
+out:
+ rcu_read_unlock();
+
+ return oom_group;
+}
+
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+ pr_info("Tasks in ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(" are going to be killed due to memory.oom.group set\n");
+}
+
+/**
* lock_page_memcg - lock a page->mem_cgroup binding
* @page: the page
*
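
mem_cgroup_get_oom_group() walks from the victim's memcg toward the OOM domain and keeps overwriting its candidate, so the highest-level cgroup with oom.group set wins. A self-contained toy model of that walk:

#include <stdio.h>

struct node {
	const char *name;
	int oom_group;
	struct node *parent;
};

/* Toy version of the traversal: remember the last (highest) match. */
static struct node *pick_oom_group(struct node *victim, struct node *domain)
{
	struct node *group = NULL;

	for (struct node *n = victim; n; n = n->parent) {
		if (n->oom_group)
			group = n;	/* higher matches override earlier ones */
		if (n == domain)
			break;
	}
	return group;
}

int main(void)
{
	struct node root = { "root", 0, NULL };
	struct node a = { "a", 1, &root };
	struct node b = { "b", 1, &a };
	struct node *g = pick_oom_group(&b, &root);

	printf("kill whole group: %s\n", g ? g->name : "(none)");	/* "a" */
	return 0;
}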
@@ -1899,6 +2154,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
+ bool oomed = false;
+ enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -1986,6 +2243,9 @@ retry:
if (nr_retries--)
goto retry;
+ if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ goto nomem;
+
if (gfp_mask & __GFP_NOFAIL)
goto force;
@@ -1994,8 +2254,23 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_OOM);
- mem_cgroup_oom(mem_over_limit, gfp_mask,
+ /*
+ * Keep retrying as long as the memcg OOM killer is able to make
+ * forward progress, or bypass the charge if the OOM killer
+ * cannot make any progress.
+ */
+ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
+ switch (oom_status) {
+ case OOM_SUCCESS:
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ oomed = true;
+ goto retry;
+ case OOM_FAILED:
+ goto force;
+ default:
+ goto nomem;
+ }
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
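
The retry logic above reads most naturally as a small state machine: reclaim until the retries run out, OOM-kill at most once per sequence (oomed), force the charge if the killer cannot find a victim, and fail otherwise. A toy, userspace-only model of that flow; the reclaim stub and retry counts are invented, and the oomed early-exit stands in for the __GFP_RETRY_MAYFAIL case:

#include <stdbool.h>
#include <stdio.h>

enum oom_status { OOM_SUCCESS, OOM_FAILED, OOM_ASYNC, OOM_SKIPPED };

/* Stand-in for mem_cgroup_oom(); change the return to explore other paths. */
static enum oom_status try_oom_kill(void)
{
	return OOM_SUCCESS;
}

/* Stand-in for reclaim + charge; succeeds after a few attempts. */
static bool reclaim_and_charge(void)
{
	static int attempts;

	return ++attempts > 7;
}

static int toy_charge(void)
{
	int nr_retries = 5;
	bool oomed = false;

	for (;;) {
		if (reclaim_and_charge())
			return 0;	/* charge fits after reclaim */
		if (nr_retries--)
			continue;	/* keep reclaiming */
		if (oomed)
			return -1;	/* already OOM-killed once: give up */
		switch (try_oom_kill()) {
		case OOM_SUCCESS:	/* a victim died: start over */
			nr_retries = 5;
			oomed = true;
			continue;
		case OOM_FAILED:	/* no victim found: force the charge */
			return 0;
		default:		/* OOM_ASYNC/OOM_SKIPPED: -ENOMEM */
			return -1;
		}
	}
}

int main(void)
{
	printf("toy_charge() = %d\n", toy_charge());
	return 0;
}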
@@ -2119,7 +2394,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2261,7 +2536,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (current->memcg_kmem_skip_account)
return cachep;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup_from_current();
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2345,7 +2620,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
if (memcg_kmem_bypass())
return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
@@ -2384,7 +2659,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
css_put_many(&memcg->css, nr_pages);
}
-#endif /* !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2680,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
- struct mem_cgroup *iter;
- int i;
-
- memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += memcg_page_state(iter, i);
- }
-}
+struct accumulated_stats {
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ unsigned long lru_pages[NR_LRU_LISTS];
+ const unsigned int *stats_array;
+ const unsigned int *events_array;
+ int stats_size;
+ int events_size;
+};
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
+static void accumulate_memcg_tree(struct mem_cgroup *memcg,
+ struct accumulated_stats *acc)
{
- struct mem_cgroup *iter;
+ struct mem_cgroup *mi;
int i;
- memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+ for_each_mem_cgroup_tree(mi, memcg) {
+ for (i = 0; i < acc->stats_size; i++)
+ acc->stat[i] += memcg_page_state(mi,
+ acc->stats_array ? acc->stats_array[i] : i);
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += memcg_sum_events(iter, i);
+ for (i = 0; i < acc->events_size; i++)
+ acc->events[i] += memcg_sum_events(mi,
+ acc->events_array ? acc->events_array[i] : i);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ acc->lru_pages[i] +=
+ mem_cgroup_nr_lru_pages(mi, BIT(i));
}
}
@@ -2779,7 +3059,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
@@ -2851,7 +3131,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
}
rcu_read_unlock();
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+ memcg_drain_all_list_lrus(kmemcg_id, parent);
memcg_free_cache_id(kmemcg_id);
}
@@ -2879,7 +3159,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
-#endif /* !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
static int memcg_update_kmem_max(struct mem_cgroup *memcg,
unsigned long max)
@@ -3113,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
+ struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3145,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- unsigned long long val = 0;
+ memset(&acc, 0, sizeof(acc));
+ acc.stats_size = ARRAY_SIZE(memcg1_stats);
+ acc.stats_array = memcg1_stats;
+ acc.events_size = ARRAY_SIZE(memcg1_events);
+ acc.events_array = memcg1_events;
+ accumulate_memcg_tree(memcg, &acc);
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_page_state(mi, memcg1_stats[i]) *
- PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+ seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ (u64)acc.stat[i] * PAGE_SIZE);
}
- for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
- unsigned long long val = 0;
-
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_sum_events(mi, memcg1_events[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++) {
- unsigned long long val = 0;
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+ seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+ (u64)acc.events[i]);
- for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
- }
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)acc.lru_pages[i] * PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4037,6 +4313,14 @@ static struct cftype mem_cgroup_legacy_files[] = {
static DEFINE_IDR(mem_cgroup_idr);
+static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+{
+ if (memcg->id.id > 0) {
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
+ memcg->id.id = 0;
+ }
+}
+
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
@@ -4047,8 +4331,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
if (atomic_sub_and_test(n, &memcg->id.ref)) {
- idr_remove(&mem_cgroup_idr, memcg->id.id);
- memcg->id.id = 0;
+ mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
css_put(&memcg->css);
@@ -4176,7 +4459,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
memcg->socket_pressure = jiffies;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -4185,8 +4468,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
fail:
- if (memcg->id.id > 0)
- idr_remove(&mem_cgroup_idr, memcg->id.id);
+ mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return NULL;
}
@@ -4245,6 +4527,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
fail:
+ mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
return ERR_PTR(-ENOMEM);
}
@@ -4253,6 +4536,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * A memcg must be visible for memcg_expand_shrinker_maps()
+ * by the time the maps are allocated. So, we allocate maps
+ * here, when for_each_mem_cgroup() can't skip it.
+ */
+ if (memcg_alloc_shrinker_maps(memcg)) {
+ mem_cgroup_id_remove(memcg);
+ return -ENOMEM;
+ }
+
/* Online state pins memcg ID, memcg ID pins CSS */
atomic_set(&memcg->id.ref, 1);
css_get(css);
@@ -4305,6 +4598,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
+ memcg_free_shrinker_maps(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -5249,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
+ struct accumulated_stats acc;
int i;
/*
@@ -5264,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
- tree_stat(memcg, stat);
- tree_events(memcg, events);
+ memset(&acc, 0, sizeof(acc));
+ acc.stats_size = MEMCG_NR_STAT;
+ acc.events_size = NR_VM_EVENT_ITEMS;
+ accumulate_memcg_tree(memcg, &acc);
seq_printf(m, "anon %llu\n",
- (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
- (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+ (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
- (u64)(stat[NR_SLAB_RECLAIMABLE] +
- stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+ (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
+ acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "shmem %llu\n",
- (u64)stat[NR_SHMEM] * PAGE_SIZE);
+ (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+ (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+ (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
+ (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
- for (i = 0; i < NR_LRU_LISTS; i++) {
- struct mem_cgroup *mi;
- unsigned long val = 0;
-
- for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_nr_lru_pages(mi, BIT(i));
- seq_printf(m, "%s %llu\n",
- mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
- }
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)acc.lru_pages[i] * PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n",
- (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+ (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+ seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
+ seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
- seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
- events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
- events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+ seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
+ seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
+ acc.events[PGSCAN_DIRECT]);
+ seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
+ acc.events[PGSTEAL_DIRECT]);
+ seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
+ seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
+ seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
+ seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
seq_printf(m, "workingset_refault %lu\n",
- stat[WORKINGSET_REFAULT]);
+ acc.stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n",
- stat[WORKINGSET_ACTIVATE]);
+ acc.stat[WORKINGSET_ACTIVATE]);
seq_printf(m, "workingset_nodereclaim %lu\n",
- stat[WORKINGSET_NODERECLAIM]);
+ acc.stat[WORKINGSET_NODERECLAIM]);
return 0;
}
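
The seq_printf calls above now read everything out of one struct accumulated_stats, filled by a single walk of the memcg subtree instead of one for_each_mem_cgroup_tree() pass per statistic. The helper itself is added earlier in this patch; a rough sketch of its shape, reconstructed from the fields used here (details beyond acc.stat/acc.events/acc.lru_pages are an assumption):

struct accumulated_stats {
        unsigned long stat[MEMCG_NR_STAT];
        unsigned long events[NR_VM_EVENT_ITEMS];
        unsigned long lru_pages[NR_LRU_LISTS];
        const unsigned int *stats_array;        /* optional index remap */
        const unsigned int *events_array;
        int stats_size;
        int events_size;
};

static void accumulate_memcg_tree(struct mem_cgroup *memcg,
                                  struct accumulated_stats *acc)
{
        struct mem_cgroup *mi;
        int i;

        /* one subtree walk accumulates stats, events and LRU counts */
        for_each_mem_cgroup_tree(mi, memcg) {
                for (i = 0; i < acc->stats_size; i++)
                        acc->stat[i] += memcg_page_state(mi,
                                acc->stats_array ? acc->stats_array[i] : i);
                for (i = 0; i < acc->events_size; i++)
                        acc->events[i] += memcg_sum_events(mi,
                                acc->events_array ? acc->events_array[i] : i);
                for (i = 0; i < NR_LRU_LISTS; i++)
                        acc->lru_pages[i] +=
                                mem_cgroup_nr_lru_pages(mi, BIT(i));
        }
}
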
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ seq_printf(m, "%d\n", memcg->oom_group);
+
+ return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int ret, oom_group;
+
+ buf = strstrip(buf);
+ if (!buf)
+ return -EINVAL;
+
+ ret = kstrtoint(buf, 0, &oom_group);
+ if (ret)
+ return ret;
+
+ if (oom_group != 0 && oom_group != 1)
+ return -EINVAL;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5369,6 +5689,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
+ {
+ .name = "oom.group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
{ } /* terminate */
};
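
Enabling the new knob from userspace is a plain write of "0" or "1"; memory_oom_group_write() above rejects anything else with -EINVAL. A minimal sketch, assuming cgroup2 is mounted at /sys/fs/cgroup and a hypothetical group "mygroup" exists:

#include <stdio.h>

int main(void)
{
        /* path is illustrative; point it at a real cgroup */
        FILE *f = fopen("/sys/fs/cgroup/mygroup/memory.oom.group", "w");

        if (!f) {
                perror("memory.oom.group");
                return 1;
        }
        fputs("1\n", f);        /* kill the whole group on OOM */
        return fclose(f) ? 1 : 0;
}
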
@@ -5593,6 +5919,19 @@ out:
return ret;
}
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+ memcg = *memcgp;
+ mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+ return ret;
+}
+
/**
* mem_cgroup_commit_charge - commit a page charge
* @page: page to charge
@@ -6003,7 +6342,7 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/*
* Kmem cache creation is mostly done with the slab_mutex held,
* so use a workqueue with limited concurrency to avoid stalling
diff --git a/mm/memfd.c b/mm/memfd.c
index 27069518e3c5..2bb5e257080e 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -326,7 +326,7 @@ SYSCALL_DEFINE2(memfd_create,
goto err_fd;
}
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
- file->f_flags |= O_RDWR | O_LARGEFILE;
+ file->f_flags |= O_LARGEFILE;
if (flags & MFD_ALLOW_SEALING) {
file_seals = memfd_file_seals_ptr(file);
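
Dropping O_RDWR here relies on the access mode already being set when the memfd's file is allocated, so the OR was redundant. That can be checked from userspace; a small sketch (what the kernel reports back is stated as an expectation, not proven by this hunk):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        int fd = syscall(SYS_memfd_create, "demo", 0);

        if (fd < 0) {
                perror("memfd_create");
                return 1;
        }
        /* the access mode should still read back as O_RDWR */
        printf("f_flags %#x, accmode %#x\n",
               fcntl(fd, F_GETFL), fcntl(fd, F_GETFL) & O_ACCMODE);
        close(fd);
        return 0;
}
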
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9b86dc..0cd3de3550f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -55,8 +55,10 @@
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
+#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
+#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -174,22 +176,51 @@ int hwpoison_filter(struct page *p)
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+ struct list_head nd;
+ struct task_struct *tsk;
+ unsigned long addr;
+ short size_shift;
+ char addr_valid;
+};
+
+/*
* Send all the processes who have the page mapped a signal.
* ``action optional'' if they are not immediately affected by the error
* ``action required'' if error happened in current execution context
*/
-static int kill_proc(struct task_struct *t, unsigned long addr,
- unsigned long pfn, struct page *page, int flags)
+static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
- short addr_lsb;
+ struct task_struct *t = tk->tsk;
+ short addr_lsb = tk->size_shift;
int ret;
pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
- addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
- ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr,
+ ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
addr_lsb, current);
} else {
/*
@@ -198,7 +229,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr,
* This could cause a loop when the user sets SIGBUS
* to SIG_IGN, but hopefully no one will do that?
*/
- ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr,
+ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
addr_lsb, t); /* synchronous? */
}
if (ret < 0)
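
On the receiving end, tk->size_shift is delivered as siginfo's si_addr_lsb, telling the process how large the poisoned region around si_addr is. A minimal userspace handler consuming it (illustrative only; printf is not async-signal-safe):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_sigbus(int sig, siginfo_t *info, void *ctx)
{
        if (info->si_code == BUS_MCEERR_AR ||
            info->si_code == BUS_MCEERR_AO)
                /* 1UL << si_addr_lsb bytes around si_addr are affected */
                printf("poison at %p, %lu bytes\n",
                       info->si_addr, 1UL << info->si_addr_lsb);
        _exit(1);
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = on_sigbus,
                .sa_flags = SA_SIGINFO,
        };

        sigaction(SIGBUS, &sa, NULL);
        /* ... touch memory that may have been poisoned ... */
        return 0;
}
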
@@ -234,34 +265,39 @@ void shake_page(struct page *p, int access)
}
EXPORT_SYMBOL_GPL(shake_page);
-/*
- * Kill all processes that have a poisoned page mapped and then isolate
- * the page.
- *
- * General strategy:
- * Find all processes having the page mapped and kill them.
- * But we keep a page reference around so that the page is not
- * actually freed yet.
- * Then stash the page away
- *
- * There's no convenient way to get back to mapped processes
- * from the VMAs. So do a brute-force search over all
- * running processes.
- *
- * Remember that machine checks are not common (or rather
- * if they are common you have other problems), so this shouldn't
- * be a performance issue.
- *
- * Also there are some races possible while we get from the
- * error detection to actually handle it.
- */
-
-struct to_kill {
- struct list_head nd;
- struct task_struct *tsk;
- unsigned long addr;
- char addr_valid;
-};
+static unsigned long dev_pagemap_mapping_shift(struct page *page,
+ struct vm_area_struct *vma)
+{
+ unsigned long address = vma_address(page, vma);
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return 0;
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return 0;
+ if (pud_devmap(*pud))
+ return PUD_SHIFT;
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+ if (pmd_devmap(*pmd))
+ return PMD_SHIFT;
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+ if (pte_devmap(*pte))
+ return PAGE_SHIFT;
+ return 0;
+}
/*
* Failure handling: if we can't find or can't kill a process there's
@@ -292,6 +328,10 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
tk->addr = page_address_in_vma(p, vma);
tk->addr_valid = 1;
+ if (is_zone_device_page(p))
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+ else
+ tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
/*
* In theory we don't have to kill when the page was
@@ -299,7 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* likely very rare kill anyways just out of paranoia, but use
* a SIGKILL because the error is not contained anymore.
*/
- if (tk->addr == -EFAULT) {
+ if (tk->addr == -EFAULT || tk->size_shift == 0) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
@@ -317,9 +357,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs(struct list_head *to_kill, int forcekill,
- bool fail, struct page *page, unsigned long pfn,
- int flags)
+static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
+ unsigned long pfn, int flags)
{
struct to_kill *tk, *next;
@@ -342,8 +381,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill,
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc(tk->tsk, tk->addr,
- pfn, page, flags) < 0)
+ else if (kill_proc(tk, pfn, flags) < 0)
pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
@@ -515,6 +553,7 @@ static const char * const action_page_types[] = {
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MF_MSG_DAX] = "dax page",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1012,7 +1051,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* any accesses to the poisoned memory.
*/
forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
- kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);
+ kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
return unmap_success;
}
@@ -1112,6 +1151,83 @@ out:
return res;
}
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ const bool unmap_success = true;
+ unsigned long size = 0;
+ struct to_kill *tk;
+ LIST_HEAD(tokill);
+ int rc = -EBUSY;
+ loff_t start;
+
+ /*
+ * Prevent the inode from being freed while we are interrogating
+ * the address_space; typically this would be handled by
+ * lock_page(), but dax pages do not use the page lock. This
+ * also prevents changes to the mapping of this pfn until
+ * poison signaling is complete.
+ */
+ if (!dax_lock_mapping_entry(page))
+ goto out;
+
+ if (hwpoison_filter(page)) {
+ rc = 0;
+ goto unlock;
+ }
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_PUBLIC:
+ /*
+ * TODO: Handle HMM pages which may need coordination
+ * with device-side memory.
+ */
+ goto unlock;
+ default:
+ break;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+
+ list_for_each_entry(tk, &tokill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up
+ * device-dax mappings which are constant size. The
+ * actual size of the mapping being torn down is
+ * communicated in siginfo, see kill_proc()
+ */
+ start = (page->index << PAGE_SHIFT) & ~(size - 1);
+ unmap_mapping_range(page->mapping, start, start + size, 0);
+ }
+ kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+ rc = 0;
+unlock:
+ dax_unlock_mapping_entry(page);
+out:
+ /* drop pgmap ref acquired in caller */
+ put_dev_pagemap(pgmap);
+ action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+ return rc;
+}
+
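
The start/size computation above aligns the faulting file offset down to the largest mapping found, so a device-dax PMD mapping is torn down whole rather than split. A worked example with hypothetical numbers (PAGE_SHIFT = 12 and size_shift = PMD_SHIFT = 21, as on x86-64):

#include <stdio.h>

int main(void)
{
        unsigned long index = 0x345;            /* hypothetical page->index */
        unsigned long size  = 1UL << 21;        /* 2 MiB, from size_shift */
        unsigned long off   = index << 12;      /* byte offset in the file */
        unsigned long start = off & ~(size - 1);

        /* prints: offset 0x345000 -> unmap [0x200000, 0x400000) */
        printf("offset %#lx -> unmap [%#lx, %#lx)\n",
               off, start, start + size);
        return 0;
}
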
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@@ -1134,6 +1250,7 @@ int memory_failure(unsigned long pfn, int flags)
struct page *p;
struct page *hpage;
struct page *orig_head;
+ struct dev_pagemap *pgmap;
int res;
unsigned long page_flags;
@@ -1146,6 +1263,10 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (pgmap)
+ return memory_failure_dev_pagemap(pfn, flags, pgmap);
+
p = pfn_to_page(pfn);
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
@@ -1167,7 +1288,7 @@ int memory_failure(unsigned long pfn, int flags)
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
* In fact it's dangerous to directly bump up page count from 0,
- * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+ * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
@@ -1598,8 +1719,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- if (PageHuge(page))
- dissolve_free_huge_page(page);
+ /*
+ * We set PG_hwpoison only when the migration source hugepage
+ * was successfully dissolved, because otherwise the hwpoisoned
+ * hugepage would remain on the free hugepage list and userspace
+ * would later hit SIGBUS when it is allocated. That's not
+ * expected in soft-offlining.
+ */
+ ret = dissolve_free_huge_page(page);
+ if (!ret) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ }
}
return ret;
}
@@ -1687,6 +1818,7 @@ static int __soft_offline_page(struct page *page, int flags)
static int soft_offline_in_use_page(struct page *page, int flags)
{
int ret;
+ int mt;
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
@@ -1705,23 +1837,37 @@ static int soft_offline_in_use_page(struct page *page, int flags)
put_hwpoison_page(hpage);
}
+ /*
+ * Setting MIGRATE_ISOLATE here ensures that the page will be linked
+ * to the free list immediately (not via the pcplist) when released
+ * after successful page migration. Otherwise we can't guarantee that
+ * the page is really free after put_page() returns, and
+ * set_hwpoison_free_buddy_page() would very likely fail.
+ */
+ mt = get_pageblock_migratetype(page);
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
-
+ set_pageblock_migratetype(page, mt);
return ret;
}
-static void soft_offline_free_page(struct page *page)
+static int soft_offline_free_page(struct page *page)
{
+ int rc = 0;
struct page *head = compound_head(page);
- if (!TestSetPageHWPoison(head)) {
- num_poisoned_pages_inc();
- if (PageHuge(head))
- dissolve_free_huge_page(page);
+ if (PageHuge(head))
+ rc = dissolve_free_huge_page(page);
+ if (!rc) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ else
+ rc = -EBUSY;
}
+ return rc;
}
/**
@@ -1751,6 +1897,14 @@ int soft_offline_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (is_zone_device_page(page)) {
+ pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
+ pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_page(page);
+ return -EIO;
+ }
+
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
if (flags & MF_COUNT_INCREASED)
@@ -1765,7 +1919,7 @@ int soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = soft_offline_in_use_page(page, flags);
else if (ret == 0)
- soft_offline_free_page(page);
+ ret = soft_offline_free_page(page);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 7206a634270b..c467102a5cbc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,23 +238,13 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
__tlb_reset_range(tlb);
}
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
- if (!tlb->end)
- return;
+ struct mmu_gather_batch *batch;
- tlb_flush(tlb);
- mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
- __tlb_reset_range(tlb);
-}
-
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
-
for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
@@ -330,6 +320,21 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
* See the comment near struct mmu_table_batch.
*/
+/*
+ * Used if tlb_remove_table() is expected to imply a TLB invalidate.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then we still
+ * need to RCU-sched wait while freeing the pages because software
+ * walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
@@ -366,6 +371,7 @@ void tlb_table_flush(struct mmu_gather *tlb)
struct mmu_table_batch **batch = &tlb->batch;
if (*batch) {
+ tlb_table_invalidate(tlb);
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
}
@@ -375,23 +381,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
struct mmu_table_batch **batch = &tlb->batch;
- /*
- * When there's less then two users of this mm there cannot be a
- * concurrent page-table walk.
- */
- if (atomic_read(&tlb->mm->mm_users) < 2) {
- __tlb_remove_table(table);
- return;
- }
-
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
+ tlb_table_invalidate(tlb);
tlb_remove_table_one(table);
return;
}
(*batch)->nr = 0;
}
+
(*batch)->tables[(*batch)->nr++] = table;
if ((*batch)->nr == MAX_TABLE_BATCH)
tlb_table_flush(tlb);
@@ -853,6 +852,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
}
+
+ if (pte_devmap(pte))
+ return NULL;
+
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -917,6 +920,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
}
}
+ if (pmd_devmap(pmd))
+ return NULL;
if (is_zero_pfn(pfn))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
@@ -1017,7 +1022,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
- if (is_cow_mapping(vm_flags)) {
+ if (is_cow_mapping(vm_flags) && pte_write(pte)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
@@ -1417,11 +1422,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
- !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
}
@@ -1603,20 +1606,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, end, NULL);
-
- /*
- * zap_page_range does not specify whether mmap_sem should be
- * held for read or write. That allows parallel zap_page_range
- * operations to unmap a PTE and defer a flush meaning that
- * this call observes pte_none and fails to flush the TLB.
- * Rather than adding a complex API, ensure that no stale
- * TLB entries exist when this call returns.
- */
- flush_tlb_range(vma, start, end);
- }
-
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1886,6 +1877,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
+ if (!pfn_modify_allowed(pfn, pgprot))
+ return -EACCES;
+
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@ -1921,6 +1915,9 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
+ if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ return -EACCES;
+
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
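
With pfn_modify_allowed() in the path, vm_insert_pfn() and friends gain -EACCES as a possible return, so driver fault handlers should treat it as a hard failure. A hedged kernel-style sketch (mydrv_pfn_for() is a hypothetical helper):

static vm_fault_t mydrv_fault(struct vm_fault *vmf)
{
        unsigned long pfn = mydrv_pfn_for(vmf->pgoff);  /* hypothetical */
        int err = vm_insert_pfn(vmf->vma, vmf->address, pfn);

        switch (err) {
        case 0:
        case -EBUSY:            /* lost a race; the PTE is already there */
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:                /* including the new -EACCES */
                return VM_FAULT_SIGBUS;
        }
}
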
@@ -1982,6 +1979,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
spinlock_t *ptl;
+ int err = 0;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
@@ -1989,12 +1987,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- return 0;
+ return err;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -2003,6 +2005,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
@@ -2011,9 +2014,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
- if (remap_pte_range(mm, pmd, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pte_range(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pmd++, addr = next, addr != end);
return 0;
}
@@ -2024,6 +2028,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
{
pud_t *pud;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, p4d, addr);
@@ -2031,9 +2036,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (remap_pmd_range(mm, pud, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pmd_range(mm, pud, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pud++, addr = next, addr != end);
return 0;
}
@@ -2044,6 +2050,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
{
p4d_t *p4d;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
p4d = p4d_alloc(mm, pgd, addr);
@@ -2051,9 +2058,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (remap_pud_range(mm, p4d, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pud_range(mm, p4d, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (p4d++, addr = next, addr != end);
return 0;
}
@@ -2369,9 +2377,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
- int ret;
+ vm_fault_t ret;
struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
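
The int -> vm_fault_t churn that runs through the rest of this file is groundwork for making fault codes a distinct sparse-checked type, so mixing them with errno-style ints becomes a static-analysis error. A self-contained sketch of the __bitwise technique the kernel uses for such types (the actual vm_fault_t definition is outside this patch, so treat this as illustrative):

#include <stdio.h>

#ifdef __CHECKER__                      /* defined by sparse */
#define __bitwise __attribute__((bitwise))
#define __force   __attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned int __bitwise fault_t;  /* stand-in for vm_fault_t */

#define FAULT_OOM       ((__force fault_t)0x0001)
#define FAULT_SIGBUS    ((__force fault_t)0x0002)

static fault_t handle(int oom)
{
        /* "return -ENOMEM;" here would be flagged by sparse */
        return oom ? FAULT_OOM : (__force fault_t)0;
}

int main(void)
{
        printf("%#x\n", (__force unsigned int)handle(1));
        return 0;
}
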
@@ -2475,7 +2483,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct vm_fault *vmf)
+static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -2503,7 +2511,7 @@ static int wp_page_copy(struct vm_fault *vmf)
cow_user_page(new_page, old_page, vmf->address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -2623,7 +2631,7 @@ oom:
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
*/
-int finish_mkwrite_fault(struct vm_fault *vmf)
+vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
@@ -2644,12 +2652,12 @@ int finish_mkwrite_fault(struct vm_fault *vmf)
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct vm_fault *vmf)
+static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- int ret;
+ vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
@@ -2662,7 +2670,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
@@ -2670,7 +2678,7 @@ static int wp_page_shared(struct vm_fault *vmf)
get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
+ vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
@@ -2713,7 +2721,7 @@ static int wp_page_shared(struct vm_fault *vmf)
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct vm_fault *vmf)
+static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
@@ -2889,7 +2897,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct vm_fault *vmf)
+vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
@@ -2898,7 +2906,7 @@ int do_swap_page(struct vm_fault *vmf)
pte_t pte;
int locked;
int exclusive = 0;
- int ret = 0;
+ vm_fault_t ret = 0;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
@@ -3003,8 +3011,8 @@ int do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -3109,12 +3117,12 @@ out_release:
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct vm_fault *vmf)
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
@@ -3165,7 +3173,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+ false))
goto oom_free_page;
/*
@@ -3223,10 +3232,10 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct vm_fault *vmf)
+static vm_fault_t __do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret;
+ vm_fault_t ret;
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
@@ -3260,7 +3269,7 @@ static int pmd_devmap_trans_unstable(pmd_t *pmd)
return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}
-static int pte_alloc_one_map(struct vm_fault *vmf)
+static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3336,13 +3345,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-static int do_set_pmd(struct vm_fault *vmf, struct page *page)
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
- int i, ret;
+ int i;
+ vm_fault_t ret;
if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
@@ -3372,7 +3382,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
@@ -3392,7 +3402,7 @@ out:
return ret;
}
#else
-static int do_set_pmd(struct vm_fault *vmf, struct page *page)
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -3413,13 +3423,13 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
- int ret;
+ vm_fault_t ret;
if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
@@ -3478,10 +3488,10 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
*/
-int finish_fault(struct vm_fault *vmf)
+vm_fault_t finish_fault(struct vm_fault *vmf)
{
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3567,12 +3577,13 @@ late_initcall(fault_around_debugfs);
* (and therefore to page order). This way it's easier to guarantee
* that we don't cross page table boundaries.
*/
-static int do_fault_around(struct vm_fault *vmf)
+static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
unsigned long address = vmf->address, nr_pages, mask;
pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
- int off, ret = 0;
+ int off;
+ vm_fault_t ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
@@ -3622,10 +3633,10 @@ out:
return ret;
}
-static int do_read_fault(struct vm_fault *vmf)
+static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret = 0;
+ vm_fault_t ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
@@ -3649,10 +3660,10 @@ static int do_read_fault(struct vm_fault *vmf)
return ret;
}
-static int do_cow_fault(struct vm_fault *vmf)
+static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret;
+ vm_fault_t ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -3661,7 +3672,7 @@ static int do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
&vmf->memcg, false)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
@@ -3688,10 +3699,10 @@ uncharge_out:
return ret;
}
-static int do_shared_fault(struct vm_fault *vmf)
+static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret, tmp;
+ vm_fault_t ret, tmp;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3729,10 +3740,10 @@ static int do_shared_fault(struct vm_fault *vmf)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct vm_fault *vmf)
+static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret;
+ vm_fault_t ret;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
@@ -3767,7 +3778,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct vm_fault *vmf)
+static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
@@ -3857,7 +3868,7 @@ out:
return 0;
}
-static inline int create_huge_pmd(struct vm_fault *vmf)
+static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
@@ -3867,7 +3878,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf)
}
/* `inline' is required to avoid gcc 4.1.2 build error */
-static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
@@ -3886,7 +3897,7 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
}
-static int create_huge_pud(struct vm_fault *vmf)
+static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
@@ -3898,7 +3909,7 @@ static int create_huge_pud(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
@@ -3925,7 +3936,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct vm_fault *vmf)
+static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
@@ -4013,8 +4024,8 @@ unlock:
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
{
struct vm_fault vmf = {
.vma = vma,
@@ -4027,7 +4038,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
- int ret;
+ vm_fault_t ret;
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
@@ -4102,10 +4113,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- int ret;
+ vm_fault_t ret;
__set_current_state(TASK_RUNNING);
@@ -4125,7 +4136,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
- mem_cgroup_oom_enable();
+ mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4133,7 +4144,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
- mem_cgroup_oom_disable();
+ mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
@@ -4397,6 +4408,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+ if (!maddr)
+ return -ENOMEM;
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
@@ -4568,71 +4582,93 @@ EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-static void clear_gigantic_page(struct page *page,
- unsigned long addr,
- unsigned int pages_per_huge_page)
-{
- int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < pages_per_huge_page;
- i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-void clear_huge_page(struct page *page,
- unsigned long addr_hint, unsigned int pages_per_huge_page)
+/*
+ * Process all subpages of the specified huge page with the specified
+ * operation. The target subpage will be processed last to keep its
+ * cache lines hot.
+ */
+static inline void process_huge_page(
+ unsigned long addr_hint, unsigned int pages_per_huge_page,
+ void (*process_subpage)(unsigned long addr, int idx, void *arg),
+ void *arg)
{
int i, n, base, l;
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
- if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, pages_per_huge_page);
- return;
- }
-
- /* Clear sub-page to access last to keep its cache lines hot */
+ /* Process target subpage last to keep its cache lines hot */
might_sleep();
n = (addr_hint - addr) / PAGE_SIZE;
if (2 * n <= pages_per_huge_page) {
- /* If sub-page to access in first half of huge page */
+ /* If target subpage in first half of huge page */
base = 0;
l = n;
- /* Clear sub-pages at the end of huge page */
+ /* Process subpages at the end of huge page */
for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
}
} else {
- /* If sub-page to access in second half of huge page */
+ /* If target subpage in second half of huge page */
base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
l = pages_per_huge_page - n;
- /* Clear sub-pages at the begin of huge page */
+ /* Process subpages at the beginning of huge page */
for (i = 0; i < base; i++) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
}
}
/*
- * Clear remaining sub-pages in left-right-left-right pattern
- * towards the sub-page to access
+ * Process remaining subpages in left-right-left-right pattern
+ * towards the target subpage
*/
for (i = 0; i < l; i++) {
int left_idx = base + i;
int right_idx = base + 2 * l - 1 - i;
cond_resched();
- clear_user_highpage(page + left_idx,
- addr + left_idx * PAGE_SIZE);
+ process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+ cond_resched();
+ process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+ }
+}
+
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
cond_resched();
- clear_user_highpage(page + right_idx,
- addr + right_idx * PAGE_SIZE);
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
+static void clear_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct page *page = arg;
+
+ clear_user_highpage(page + idx, addr);
+}
+
+void clear_huge_page(struct page *page,
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
+{
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
+}
+
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
@@ -4652,11 +4688,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
}
}
+struct copy_subpage_arg {
+ struct page *dst;
+ struct page *src;
+ struct vm_area_struct *vma;
+};
+
+static void copy_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct copy_subpage_arg *copy_arg = arg;
+
+ copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+ addr, copy_arg->vma);
+}
+
void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma,
+ unsigned long addr_hint, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
- int i;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+ struct copy_subpage_arg arg = {
+ .dst = dst,
+ .src = src,
+ .vma = vma,
+ };
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
@@ -4664,11 +4720,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
return;
}
- might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
- }
+ process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}
long copy_huge_page_from_user(struct page *dst_page,
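
The visiting order of process_huge_page() is easiest to see by printing it. A small userspace port of the index arithmetic above, for nr = 8 subpages; note that the target index always comes out last, so its cache lines stay hot:

#include <stdio.h>

static void show_order(int nr, int n)
{
        int i, base, l;

        printf("target %d:", n);
        if (2 * n <= nr) {
                /* target in first half: take the tail first */
                base = 0;
                l = n;
                for (i = nr - 1; i >= 2 * n; i--)
                        printf(" %d", i);
        } else {
                /* target in second half: take the head first */
                base = nr - 2 * (nr - n);
                l = nr - n;
                for (i = 0; i < base; i++)
                        printf(" %d", i);
        }
        /* converge on the target left-right; the target comes last */
        for (i = 0; i < l; i++)
                printf(" %d %d", base + i, base + 2 * l - 1 - i);
        printf("\n");
}

int main(void)
{
        show_order(8, 1);       /* target 1: 7 6 5 4 3 2 0 1 */
        show_order(8, 6);       /* target 6: 0 1 2 3 4 7 5 6 */
        return 0;
}
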
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7deb49f69e27..9eea6e809a4e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat)
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
@@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* we can use NODE_DATA(nid) from here */
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = start_pfn;
+
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+ free_area_init_core_hotplug(nid);
pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
@@ -1017,25 +1018,20 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
build_all_zonelists(pgdat);
/*
- * zone->managed_pages is set to an approximate value in
- * free_area_init_core(), which will cause
- * /sys/device/system/node/nodeX/meminfo has wrong data.
- * So reset it to 0 before any memory is onlined.
- */
- reset_node_managed_pages(pgdat);
-
- /*
* When memory is hot-added, all the memory is in offline state. So
* clear all zones' present_pages because they will be updated in
* online_pages() and offline_pages().
*/
+ reset_node_managed_pages(pgdat);
reset_node_present_pages(pgdat);
return pgdat;
}
-static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+static void rollback_node_hotadd(int nid)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
arch_refresh_nodedata(nid, NULL);
free_percpu(pgdat->per_cpu_nodestats);
arch_free_nodedata(pgdat);
@@ -1046,28 +1042,48 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- *
+ * @start: start addr of the node
+ * @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
+ *
+ * Returns:
+ * 1 -> a new node has been allocated
+ * 0 -> the node is already online
+ * -ENOMEM -> the node could not be allocated
*/
-int try_online_node(int nid)
+static int __try_online_node(int nid, u64 start, bool set_node_online)
{
- pg_data_t *pgdat;
- int ret;
+ pg_data_t *pgdat;
+ int ret = 1;
if (node_online(nid))
return 0;
- mem_hotplug_begin();
- pgdat = hotadd_new_pgdat(nid, 0);
+ pgdat = hotadd_new_pgdat(nid, start);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
- node_set_online(nid);
- ret = register_one_node(nid);
- BUG_ON(ret);
+
+ if (set_node_online) {
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+ }
out:
+ return ret;
+}
+
+/*
+ * Users of this function always want to online/register the node
+ */
+int try_online_node(int nid)
+{
+ int ret;
+
+ mem_hotplug_begin();
+ ret = __try_online_node(nid, 0, true);
mem_hotplug_done();
return ret;
}
@@ -1099,9 +1115,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
- pg_data_t *pgdat = NULL;
- bool new_pgdat;
- bool new_node;
+ bool new_node = false;
int ret;
start = res->start;
@@ -1111,11 +1125,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
if (ret)
return ret;
- { /* Stupid hack to suppress address-never-null warning */
- void *p = NODE_DATA(nid);
- new_pgdat = !p;
- }
-
mem_hotplug_begin();
/*
@@ -1126,48 +1135,31 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
*/
memblock_add_node(start, size, nid);
- new_node = !node_online(nid);
- if (new_node) {
- pgdat = hotadd_new_pgdat(nid, start);
- ret = -ENOMEM;
- if (!pgdat)
- goto error;
- }
+ ret = __try_online_node(nid, start, false);
+ if (ret < 0)
+ goto error;
+ new_node = ret;
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, NULL, true);
-
if (ret < 0)
goto error;
- /* we online node here. we can't roll back from here. */
- node_set_online(nid);
-
if (new_node) {
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
-
- ret = __register_one_node(nid);
- if (ret)
- goto register_fail;
-
- /*
- * link memory sections under this node. This is already
- * done when creatig memory section in register_new_memory
- * but that depends to have the node registered so offline
- * nodes have to go through register_node.
- * TODO clean up this mess.
- */
- ret = link_mem_sections(nid, start_pfn, nr_pages, false);
-register_fail:
- /*
- * If sysfs file of new node can't create, cpu on the node
+ /* If sysfs file of new node can't be created, cpu on the node
* can't be hot-added. There is no rollback way now.
* So, check by BUG_ON() to catch it reluctantly..
+ * We online node here. We can't roll back from here.
*/
+ node_set_online(nid);
+ ret = __register_one_node(nid);
BUG_ON(ret);
}
+ /* link memory sections under this node. */
+ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
+ BUG_ON(ret);
+
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
@@ -1180,8 +1172,8 @@ register_fail:
error:
/* rollback pgdat allocation and others */
- if (new_pgdat && pgdat)
- rollback_node_hotadd(nid, pgdat);
+ if (new_node)
+ rollback_node_hotadd(nid);
memblock_remove(start, size);
out:
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9ac49ef17b4e..da858f794eb6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void)
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
- return z->zone ? z->zone->node : node;
+ return z->zone ? zone_to_nid(z->zone) : node;
}
default:
@@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes);
- polnid = z->zone->node;
+ polnid = zone_to_nid(z->zone);
break;
default:
@@ -2504,7 +2504,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
goto put_new;
/* Create pseudo-vma that contains just the policy */
- memset(&pvma, 0, sizeof(struct vm_area_struct));
+ vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
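
vma_init() replaces the bare memset so that pseudo-VMAs on the stack get the same baseline setup as real ones. Roughly what it amounts to (reconstructed from its callers in this patch, so the field details are an assumption):

static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        static const struct vm_operations_struct dummy_vm_ops = {};

        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
}
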
diff --git a/mm/mempool.c b/mm/mempool.c
index b54f2c20e5e0..0ef8cc8d1602 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -111,7 +111,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
-static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
+static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_slab(element);
@@ -127,12 +127,12 @@ static __always_inline void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool, gfp_t flags)
+static void *remove_element(mempool_t *pool)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- kasan_unpoison_element(pool, element, flags);
+ kasan_unpoison_element(pool, element);
check_element(pool, element);
return element;
}
@@ -151,7 +151,7 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
void mempool_exit(mempool_t *pool)
{
while (pool->curr_nr) {
- void *element = remove_element(pool, GFP_KERNEL);
+ void *element = remove_element(pool);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
@@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node);
/**
* mempool_init - initialize a memory pool
+ * @pool: pointer to the memory pool that should be initialized
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
@@ -301,7 +302,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
- element = remove_element(pool, GFP_KERNEL);
+ element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
@@ -387,7 +388,7 @@ repeat_alloc:
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
- element = remove_element(pool, gfp_temp);
+ element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
diff --git a/mm/migrate.c b/mm/migrate.c
index 8c0af0f7cab1..d6a2e89b086a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1131,7 +1131,8 @@ out:
* gcc 4.7 and 4.8 on arm get ICEs when inlining unmap_and_move(). Work
* around it.
*/
-#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
+#if defined(CONFIG_ARM) && \
+ defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
#define ICE_noinline noinline
#else
#define ICE_noinline
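
An undefined identifier in #if silently evaluates to 0, but testing defined(GCC_VERSION) first makes the intent explicit and keeps -Wundef quiet when a non-GCC compiler is used. A minimal standalone sketch of the pattern (build with cc -Wundef to see the difference):

#include <stdio.h>

/* without the defined() guard, -Wundef warns here whenever
 * GCC_VERSION is not defined (e.g. under a non-GCC compiler) */
#if defined(GCC_VERSION) && GCC_VERSION >= 40700 && GCC_VERSION < 40900
#define ICE_noinline __attribute__((noinline))
#else
#define ICE_noinline
#endif

static ICE_noinline int probe(void)
{
        return 42;
}

int main(void)
{
        printf("%d\n", probe());
        return 0;
}
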
@@ -1211,7 +1212,7 @@ out:
* intentionally. Although it's rather weird,
* it's how HWPoison flag works at the moment.
*/
- if (!test_set_page_hwpoison(page))
+ if (set_hwpoison_free_buddy_page(page))
num_poisoned_pages_inc();
}
} else {
@@ -1330,8 +1331,6 @@ put_anon:
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
- if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
- num_poisoned_pages_inc();
/*
* If migration was not successful and there's a freeing callback, use
@@ -2951,7 +2950,8 @@ int migrate_vma(const struct migrate_vma_ops *ops,
/* Sanity check the arguments */
start &= PAGE_MASK;
end &= PAGE_MASK;
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma))
return -EINVAL;
if (start < vma->vm_start || start >= vma->vm_end)
return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 74e5a6547c3d..41cc47e28ad6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -527,7 +527,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
vm_flags_t old_flags = vma->vm_flags;
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
- is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
+ vma_is_dax(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5b72266b4b03..6838a530789b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void)
zone->name);
/* Iterate the zonelist */
- for_each_zone_zonelist(zone, z, zonelist, zoneid) {
-#ifdef CONFIG_NUMA
- pr_cont("%d:%s ", zone->node, zone->name);
-#else
- pr_cont("0:%s ", zone->name);
-#endif /* CONFIG_NUMA */
- }
+ for_each_zone_zonelist(zone, z, zonelist, zoneid)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
pr_cont("\n");
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index d1eb87ef4b1a..5f2b2b184c60 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -182,12 +182,12 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ vm_area_free(vma);
return next;
}
-static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
-
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
+ struct list_head *uf);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
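
vm_area_free() here, and vm_area_alloc()/vm_area_dup() later in this file, centralize VMA lifetime handling that used to be open-coded around vm_area_cachep. A sketch of what the helpers amount to, reconstructed from the sequences they replace (the real definitions live in kernel/fork.c, outside this diff):

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (vma)
                vma_init(vma, mm);      /* zero, set vm_mm, init anon_vma_chain */
        return vma;
}

struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
        struct vm_area_struct *new;

        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (new) {
                *new = *orig;                           /* copy all fields... */
                INIT_LIST_HEAD(&new->anon_vma_chain);   /* ...then fix up */
        }
        return new;
}

void vm_area_free(struct vm_area_struct *vma)
{
        kmem_cache_free(vm_area_cachep, vma);
}
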
@@ -245,7 +245,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out;
/* Ok, looks good - let it rip. */
- if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
+ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
goto out;
set_brk:
@@ -911,7 +911,7 @@ again:
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ vm_area_free(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
- vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
if (vm_flags & VM_DENYWRITE) {
@@ -1780,6 +1778,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
}
vma_link(mm, vma, prev, rb_link, rb_parent);
@@ -1796,11 +1796,12 @@ out:
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm)))
- mm->locked_vm += (len >> PAGE_SHIFT);
- else
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
@@ -1832,7 +1833,7 @@ allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
- kmem_cache_free(vm_area_cachep, vma);
+ vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
@@ -2620,15 +2621,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return err;
}
- new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ new = vm_area_dup(vma);
if (!new)
return -ENOMEM;
- /* most fields are the same, copy all, and then fixup */
- *new = *vma;
-
- INIT_LIST_HEAD(&new->anon_vma_chain);
-
if (new_below)
new->vm_end = addr;
else {
@@ -2669,7 +2665,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
out_free_mpol:
mpol_put(vma_policy(new));
out_free_vma:
- kmem_cache_free(vm_area_cachep, new);
+ vm_area_free(new);
return err;
}
@@ -2929,21 +2925,14 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
+static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- unsigned long len;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
- len = PAGE_ALIGN(request);
- if (len < request)
- return -ENOMEM;
- if (!len)
- return 0;
-
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
@@ -2991,14 +2980,13 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
/*
* create a vma struct for an anonymous mapping
*/
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ vma = vm_area_alloc(mm);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma->vm_mm = mm;
+ vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
@@ -3015,18 +3003,20 @@ out:
return 0;
}
-static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
-{
- return do_brk_flags(addr, len, 0, uf);
-}
-
-int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ len = PAGE_ALIGN(request);
+ if (len < request)
+ return -ENOMEM;
+ if (!len)
+ return 0;
+
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -3073,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm)
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
- mutex_lock(&oom_lock);
- __oom_reap_task_mm(mm);
- mutex_unlock(&oom_lock);
+ (void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
down_write(&mm->mmap_sem);
@@ -3207,16 +3195,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
- new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ new_vma = vm_area_dup(vma);
if (!new_vma)
goto out;
- *new_vma = *vma;
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
- INIT_LIST_HEAD(&new_vma->anon_vma_chain);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
if (new_vma->vm_file)
@@ -3231,7 +3217,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
- kmem_cache_free(vm_area_cachep, new_vma);
+ vm_area_free(new_vma);
out:
return NULL;
}
@@ -3355,12 +3341,10 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -3381,7 +3365,7 @@ static struct vm_area_struct *__install_special_mapping(
return vma;
out:
- kmem_cache_free(vm_area_cachep, vma);
+ vm_area_free(vma);
return ERR_PTR(ret);
}
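
The mmap.c changes above all stem from moving vm_area_struct lifecycle management behind dedicated helpers in kernel/fork.c. A minimal sketch of their shape, assuming the vm_area_cachep slab already used here (details may differ from the exact upstream bodies):

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (vma)
		vma_init(vma, mm);	/* zero, set vm_mm, init anon_vma_chain */
	return vma;
}

void vm_area_free(struct vm_area_struct *vma)
{
	kmem_cache_free(vm_area_cachep, vma);
}

This is why the open-coded vma->vm_mm assignments and INIT_LIST_HEAD(&vma->anon_vma_chain) calls disappear from mmap_region(), do_brk_flags() and __install_special_mapping(): vma_init() performs them centrally, and vma_set_anonymous() (which clears vma->vm_ops) marks the anonymous case explicitly.
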
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index eff6b88a993f..82bb1a939c0e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool blockable)
{
struct mmu_notifier *mn;
+ int ret = 0;
int id;
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->invalidate_range_start)
- mn->ops->invalidate_range_start(mn, mm, start, end);
+ if (mn->ops->invalidate_range_start) {
+ int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+ if (_ret) {
+ pr_info("%pS callback failed with %d in %sblockable context.\n",
+ mn->ops->invalidate_range_start, _ret,
+ !blockable ? "non-" : "");
+ ret = _ret;
+ }
+ }
}
srcu_read_unlock(&srcu, id);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
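
Callers pick between a blocking and a non-blocking invalidation through thin wrappers in include/linux/mmu_notifier.h, sketched below under the assumption that mm_has_notifiers() guards the slow path as elsewhere in that header:

static inline void
mmu_notifier_invalidate_range_start(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_invalidate_range_start(mm, start, end, true);
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
					     unsigned long start,
					     unsigned long end)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_invalidate_range_start(mm, start,
							     end, false);
	return 0;
}

A notifier whose invalidate_range_start() may sleep is expected to return -EAGAIN when called with blockable == false, which is what lets the oom reaper (see mm/oom_kill.c below) back off instead of deadlocking.
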
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 625608bc8962..6d331620b9e5 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -306,6 +306,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
return pages;
}
+static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_test(unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+ struct mm_walk prot_none_walk = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+ .mm = current->mm,
+ .private = &new_pgprot,
+ };
+
+ return walk_page_range(start, end, &prot_none_walk);
+}
+
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
@@ -324,6 +360,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}
/*
+ * Do PROT_NONE PFN permission checks here when we can still
+ * bail out without undoing a lot of state. This is a rather
+ * uncommon case, so doesn't need to be very optimized.
+ */
+ if (arch_has_pfn_modify_check() &&
+ (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
+ error = prot_none_walk(vma, start, end, newflags);
+ if (error)
+ return error;
+ }
+
+ /*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again. hugetlb mapping were accounted for
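
The arch hook pair arch_has_pfn_modify_check()/pfn_modify_allowed() exists for the x86 L1TF mitigation, where PROT_NONE PTEs store an inverted PFN and out-of-range PFNs must be refused. The x86 implementation at the time looked roughly like this (a sketch, not the authoritative body):

bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
	if (!boot_cpu_has_bug(X86_BUG_L1TF))
		return true;
	/* only PTEs that become non-present (PROT_NONE) are inverted */
	if (!__pte_needs_invert(pgprot_val(prot)))
		return true;
	/* real memory is always allowed */
	if (pfn_valid(pfn))
		return true;
	/* refuse PFNs beyond the L1TF-safe limit for unprivileged users */
	if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
		return false;
	return true;
}
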
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 9b02fda0886b..439af3b765a7 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -42,7 +42,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
{
void *ptr;
u64 addr;
- ulong flags = choose_memblock_flags();
+ enum memblock_flags flags = choose_memblock_flags();
if (limit > memblock.current_limit)
limit = memblock.current_limit;
@@ -72,7 +72,7 @@ again:
return ptr;
}
-/*
+/**
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
* @size: size of the range in bytes
@@ -176,7 +176,7 @@ void __init reset_all_zones_managed_pages(void)
/**
* free_all_bootmem - release free pages to the buddy allocator
*
- * Returns the number of pages actually released.
+ * Return: the number of pages actually released.
*/
unsigned long __init free_all_bootmem(void)
{
@@ -193,7 +193,7 @@ unsigned long __init free_all_bootmem(void)
/**
* free_bootmem_node - mark a page range as usable
* @pgdat: node the range resides on
- * @physaddr: starting address of the range
+ * @physaddr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
@@ -208,7 +208,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
/**
* free_bootmem - mark a page range as usable
- * @addr: starting address of the range
+ * @addr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
@@ -256,7 +256,7 @@ restart:
*
* Allocation may happen on any node in the system.
*
- * Returns NULL on failure.
+ * Return: address of the allocated region or %NULL on failure.
*/
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
@@ -293,6 +293,8 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
* Allocation may happen on any node in the system.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
@@ -367,6 +369,8 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
* can not hold the requested memory.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
@@ -396,6 +400,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
* Allocation may happen on any node in the system.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
@@ -425,6 +431,8 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size,
* can not hold the requested memory.
*
* The function panics if the request can not be satisfied.
+ *
+ * Return: address of the allocated region.
*/
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4452d8bd9ae4..e4aac33216ae 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -364,10 +364,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
@@ -769,7 +765,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
put_nommu_region(vma->vm_region);
- kmem_cache_free(vm_area_cachep, vma);
+ vm_area_free(vma);
}
/*
@@ -1145,6 +1141,8 @@ static int do_mmap_private(struct vm_area_struct *vma,
if (ret < len)
memset(base + ret, 0, len - ret);
+ } else {
+ vma_set_anonymous(vma);
}
return 0;
@@ -1204,7 +1202,7 @@ unsigned long do_mmap(struct file *file,
if (!region)
goto error_getting_region;
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ vma = vm_area_alloc(current->mm);
if (!vma)
goto error_getting_vma;
@@ -1212,7 +1210,6 @@ unsigned long do_mmap(struct file *file,
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_flags = vm_flags;
vma->vm_pgoff = pgoff;
@@ -1368,7 +1365,7 @@ error:
kmem_cache_free(vm_region_jar, region);
if (vma->vm_file)
fput(vma->vm_file);
- kmem_cache_free(vm_area_cachep, vma);
+ vm_area_free(vma);
return ret;
sharing_violation:
@@ -1469,14 +1466,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!region)
return -ENOMEM;
- new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ new = vm_area_dup(vma);
if (!new) {
kmem_cache_free(vm_region_jar, region);
return -ENOMEM;
}
/* most fields are the same, copy all, and then fixup */
- *new = *vma;
*region = *vma->vm_region;
new->vm_region = region;
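
split_vma() no longer copies the source vma by hand because vm_area_dup() does it, list initialization included. A sketch of the kernel/fork.c helper assumed here:

struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
	struct vm_area_struct *new;

	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (new) {
		*new = *orig;	/* most fields are the same, copy all */
		INIT_LIST_HEAD(&new->anon_vma_chain);
	}
	return new;
}
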
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 84081e77bc51..b5b25e4dcbbb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,6 +53,14 @@ int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
+/*
+ * Serializes oom killer invocations (out_of_memory()) from all contexts to
+ * prevent overly eager oom killing (e.g. when the oom killer is invoked
+ * from different domains).
+ *
+ * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
+ * and mark_oom_victim.
+ */
DEFINE_MUTEX(oom_lock);
#ifdef CONFIG_NUMA
@@ -392,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ pr_info("Tasks state (memory values in pages):\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
@@ -408,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
mm_pgtables_bytes(task->mm),
@@ -479,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-void __oom_reap_task_mm(struct mm_struct *mm)
+bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ bool ret = true;
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -511,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+ ret = false;
+ continue;
+ }
unmap_page_range(&tlb, vma, start, end, NULL);
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
}
+
+ return ret;
}
+/*
+ * Reaps the address space of the given task.
+ *
+ * Returns true on success and false if none or only part of the address
+ * space could be reclaimed; in that case the caller should retry later.
+ */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
- /*
- * We have to make sure to not race with the victim exit path
- * and cause premature new oom victim selection:
- * oom_reap_task_mm exit_mm
- * mmget_not_zero
- * mmput
- * atomic_dec_and_test
- * exit_oom_victim
- * [...]
- * out_of_memory
- * select_bad_process
- * # no TIF_MEMDIE task selects new victim
- * unmap_page_range # frees some memory
- */
- mutex_lock(&oom_lock);
-
if (!down_read_trylock(&mm->mmap_sem)) {
- ret = false;
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
- }
-
- /*
- * If the mm has invalidate_{start,end}() notifiers that could block,
- * sleep to give the oom victim some more time.
- * TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time
- */
- if (mm_has_blockable_invalidate_notifiers(mm)) {
- up_read(&mm->mmap_sem);
- schedule_timeout_idle(HZ);
- goto unlock_oom;
+ return false;
}
/*
@@ -564,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* down_write();up_write() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
- up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
+ goto out_unlock;
}
trace_start_task_reaping(tsk->pid);
- __oom_reap_task_mm(mm);
+ /* If part of the address space could not be reaped, try again later */
+ ret = __oom_reap_task_mm(mm);
+ if (!ret)
+ goto out_finish;
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)));
+out_finish:
+ trace_finish_task_reaping(tsk->pid);
+out_unlock:
up_read(&mm->mmap_sem);
- trace_finish_task_reaping(tsk->pid);
-unlock_oom:
- mutex_unlock(&oom_lock);
return ret;
}
@@ -835,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
- /*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
- */
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
- return;
- }
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
-
p = find_lock_task_mm(victim);
if (!p) {
put_task_struct(victim);
@@ -920,7 +858,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* in order to prevent the OOM victim from depleting the memory
* reserves from the user space under its control.
*/
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
@@ -958,7 +896,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
}
rcu_read_unlock();
@@ -971,6 +909,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
#undef K
/*
+ * Kill the provided task unless it is protected by having
+ * oom_score_adj set to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(task);
+ __oom_kill_process(task);
+ }
+ return 0;
+}
+
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *p = oc->chosen;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ struct mem_cgroup *oom_group;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Do we need to kill the entire memory cgroup?
+ * Or even one of the ancestor memory cgroups?
+ * Check this out before killing the victim task.
+ */
+ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
+ __oom_kill_process(victim);
+
+ /*
+ * If necessary, kill all tasks in the selected memory cgroup.
+ */
+ if (oom_group) {
+ mem_cgroup_print_oom_group(oom_group);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_put(oom_group);
+ }
+}
+
+/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(struct oom_control *oc,
@@ -1077,15 +1108,9 @@ bool out_of_memory(struct oom_control *oc)
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (oc->chosen && oc->chosen != (void *)-1UL) {
+ if (oc->chosen && oc->chosen != (void *)-1UL)
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
- /*
- * Give the killed process a good chance to exit before trying
- * to allocate memory again.
- */
- schedule_timeout_killable(1);
- }
return !!oc->chosen;
}
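
With the oom_lock and the blockable-notifier sleep gone from oom_reap_task_mm(), retrying moves entirely to its caller. A simplified sketch of the oom reaper's per-task loop (the constant and surrounding bookkeeping are assumptions based on this series):

#define MAX_OOM_REAP_RETRIES 10

static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* retry while mmap_sem or a non-blockable notifier gets in the way */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	/* hide the mm from the oom killer whether or not reaping succeeded */
	set_bit(MMF_OOM_SKIP, &mm->flags);
	put_task_struct(tsk);
}

Dropping the schedule_timeout_killable(1) from out_of_memory() relies on exactly this: the reaper, not the allocating task, now gives the victim time to exit.
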
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 337c6afb3345..84ae9bf5858a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -27,7 +27,6 @@
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
-#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
@@ -2490,8 +2489,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
* control.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..05e983f42316 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -32,7 +32,6 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
-#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
@@ -155,16 +154,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
* The following functions are used by the suspend/hibernate code to temporarily
* change gfp_allowed_mask in order to avoid using I/O during memory allocations
* while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with pm_mutex held (gfp_allowed_mask also should
- * only be modified with pm_mutex held, unless the suspend/hibernate code is
- * guaranteed not to run in parallel with that modification).
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
*/
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
{
- WARN_ON(!mutex_is_locked(&pm_mutex));
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
if (saved_gfp_mask) {
gfp_allowed_mask = saved_gfp_mask;
saved_gfp_mask = 0;
@@ -173,7 +173,7 @@ void pm_restore_gfp_mask(void)
void pm_restrict_gfp_mask(void)
{
- WARN_ON(!mutex_is_locked(&pm_mutex));
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
@@ -2908,10 +2908,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
if (!static_branch_likely(&vm_numa_stat_key))
return;
- if (z->node != numa_node_id())
+ if (zone_to_nid(z) != numa_node_id())
local_stat = NUMA_OTHER;
- if (z->node == preferred_zone->node)
+ if (zone_to_nid(z) == zone_to_nid(preferred_zone))
__inc_numa_state(z, NUMA_HIT);
else {
__inc_numa_state(z, NUMA_MISS);
@@ -4164,11 +4164,12 @@ retry:
alloc_flags = reserve_flags;
/*
- * Reset the zonelist iterators if memory policies can be ignored.
- * These allocations are high priority and system rather than user
- * orientated.
+ * Reset the nodemask and zonelist iterators if memory policies can be
+ * ignored. These allocations are high priority and system rather than
+ * user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
+ ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
@@ -4402,19 +4403,15 @@ out:
EXPORT_SYMBOL(__alloc_pages_nodemask);
/*
- * Common helper functions.
+ * Common helper functions. Never use with __GFP_HIGHMEM because the returned
+ * address cannot represent highmem pages. Use alloc_pages and then kmap if
+ * you need to access high mem.
*/
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- /*
- * __get_free_pages() returns a virtual address, which cannot represent
- * a highmem page
- */
- VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
- page = alloc_pages(gfp_mask, order);
+ page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
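
Since __GFP_HIGHMEM is now masked off instead of tripping a VM_BUG_ON, a caller that accidentally passes it gets a lowmem page rather than a crash on debug builds. Typical usage is unchanged:

/* usage sketch: the result is always a direct-mapped (lowmem) address */
unsigned long buf = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);

if (buf) {
	/* ... use the two zeroed pages at buf ... */
	free_pages(buf, 1);
}
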
@@ -5280,7 +5277,7 @@ int local_memory_node(int node)
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
- return z->zone->node;
+ return zone_to_nid(z->zone);
}
#endif
@@ -5566,13 +5563,12 @@ static int zone_batchsize(struct zone *zone)
/*
* The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/2 of a meg.
- *
- * OK, so we don't know how big the cache is. So guess.
+ * size of the zone.
*/
batch = zone->managed_pages / 1024;
- if (batch * PAGE_SIZE > 512 * 1024)
- batch = (512 * 1024) / PAGE_SIZE;
+ /* But no more than a meg. */
+ if (batch * PAGE_SIZE > 1024 * 1024)
+ batch = (1024 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -6123,7 +6119,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
return usemapsize / 8;
}
-static void __init setup_usemap(struct pglist_data *pgdat,
+static void __ref setup_usemap(struct pglist_data *pgdat,
struct zone *zone,
unsigned long zone_start_pfn,
unsigned long zonesize)
@@ -6143,7 +6139,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -6171,14 +6167,14 @@ void __paginginit set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
{
unsigned long pages = spanned_pages;
@@ -6197,39 +6193,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat)
-{
- enum zone_type j;
- int nid = pgdat->node_id;
-
- pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
+static void pgdat_init_numabalancing(struct pglist_data *pgdat)
+{
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
+}
+#else
+static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
spin_lock_init(&pgdat->split_queue_lock);
INIT_LIST_HEAD(&pgdat->split_queue);
pgdat->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif
- init_waitqueue_head(&pgdat->kswapd_wait);
- init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+ pgdat_resize_init(pgdat);
+
+ pgdat_init_numabalancing(pgdat);
+ pgdat_init_split_queue(pgdat);
+ pgdat_init_kcompactd(pgdat);
+
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+ unsigned long remaining_pages)
+{
+ zone->managed_pages = remaining_pages;
+ zone_set_nid(zone, nid);
+ zone->name = zone_names[idx];
+ zone->zone_pgdat = NODE_DATA(nid);
+ spin_lock_init(&zone->lock);
+ zone_seqlock_init(zone);
+ zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(int nid)
+{
+ enum zone_type z;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ pgdat_init_internals(pgdat);
+ for (z = 0; z < MAX_NR_ZONES; z++)
+ zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+ pgdat_init_internals(pgdat);
pgdat->per_cpu_nodestats = &boot_nodestats;
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -6277,15 +6333,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
- zone->managed_pages = freesize;
-#ifdef CONFIG_NUMA
- zone->node = nid;
-#endif
- zone->name = zone_names[j];
- zone->zone_pgdat = pgdat;
- spin_lock_init(&zone->lock);
- zone_seqlock_init(zone);
- zone_pcp_init(zone);
+ zone_init_internals(zone, j, nid, freesize);
if (!size)
continue;
@@ -6345,8 +6393,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn, unsigned long *zholes_size)
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+ /*
+ * We start with only one section of pages; more pages are added as
+ * needed until the rest of the deferred pages are initialized.
+ */
+ pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+ pgdat->node_spanned_pages);
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif
+
+void __init free_area_init_node(int nid, unsigned long *zones_size,
+ unsigned long node_start_pfn,
+ unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
@@ -6370,20 +6434,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
zones_size, zholes_size);
alloc_node_mem_map(pgdat);
+ pgdat_set_deferred_range(pgdat);
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
- pgdat->node_spanned_pages);
- pgdat->first_deferred_pfn = ULONG_MAX;
-#endif
free_area_init_core(pgdat);
}
-#ifdef CONFIG_HAVE_MEMBLOCK
+#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
@@ -6391,7 +6447,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
*/
-void __paginginit zero_resv_unavail(void)
+void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
unsigned long pfn;
@@ -6404,8 +6460,11 @@ void __paginginit zero_resv_unavail(void)
pgcnt = 0;
for_each_resv_unavail_range(i, &start, &end) {
for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
continue;
+ }
mm_zero_struct_page(pfn_to_page(pfn));
pgcnt++;
}
@@ -6421,7 +6480,7 @@ void __paginginit zero_resv_unavail(void)
if (pgcnt)
pr_info("Reserved but unavailable: %lld pages", pgcnt);
}
-#endif /* CONFIG_HAVE_MEMBLOCK */
+#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6847,6 +6906,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
+ zero_resv_unavail();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
@@ -6857,7 +6917,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
- zero_resv_unavail();
}
static int __init cmdline_parse_core(char *p, unsigned long *core,
@@ -6939,9 +6998,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
start = (void *)PAGE_ALIGN((unsigned long)start);
end = (void *)((unsigned long)end & PAGE_MASK);
for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+ struct page *page = virt_to_page(pos);
+ void *direct_map_addr;
+
+ /*
+ * 'direct_map_addr' might be different from 'pos'
+ * because some architectures' virt_to_page()
+ * works with aliases. Getting the direct map
+ * address ensures that we get a _writeable_
+ * alias for the memset().
+ */
+ direct_map_addr = page_address(page);
if ((unsigned int)poison <= 0xFF)
- memset(pos, poison, PAGE_SIZE);
- free_reserved_page(virt_to_page(pos));
+ memset(direct_map_addr, poison, PAGE_SIZE);
+
+ free_reserved_page(page);
}
if (pages && s)
@@ -7033,9 +7104,9 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init free_area_init(unsigned long *zones_size)
{
+ zero_resv_unavail();
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
- zero_resv_unavail();
}
static int page_alloc_cpu_dead(unsigned int cpu)
@@ -8024,3 +8095,33 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
+ * test is performed under the zone lock to prevent a race against page
+ * allocation.
+ */
+bool set_hwpoison_free_buddy_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ unsigned int order;
+ bool hwpoisoned = false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+ if (PageBuddy(page_head) && page_order(page_head) >= order) {
+ if (!TestSetPageHWPoison(page))
+ hwpoisoned = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return hwpoisoned;
+}
+#endif
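
set_hwpoison_free_buddy_page() is meant for the soft-offline path in mm/memory-failure.c, where taking the zone lock closes the race against the page being handed out while it is poisoned. A hypothetical wrapper showing the call pattern (the caller's name here is illustrative, not upstream):

static int hwpoison_free_page_sketch(struct page *page)
{
	if (set_hwpoison_free_buddy_page(page)) {
		num_poisoned_pages_inc();	/* page was free, now poisoned */
		return 0;
	}
	return -EBUSY;	/* lost the race to an allocation; caller retries */
}
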
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 5295ef331165..a9826da84ccb 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(struct page *page)
+struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
@@ -195,7 +195,7 @@ fail:
#else /* CONFIG_FLAT_NODE_MEM_MAP */
-struct page_ext *lookup_page_ext(struct page *page)
+struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
diff --git a/mm/page_io.c b/mm/page_io.c
index b41cf9644585..aafd19ec1db4 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+ bio_associate_blkcg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/percpu.c b/mm/percpu.c
index 0b6480979ac7..a749d4d96e3e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -170,6 +170,14 @@ static LIST_HEAD(pcpu_map_extend_chunks);
int pcpu_nr_empty_pop_pages;
/*
+ * The number of populated pages in use by the allocator, protected by
 * pcpu_lock. This number is kept per unit per chunk (i.e. when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
+
+/*
* Balance work is used to populate or destroy chunks asynchronously. We
* try to keep the number of populated free pages between
* PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
@@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
+ pcpu_nr_populated += nr;
if (!for_alloc) {
chunk->nr_empty_pop_pages += nr;
@@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
chunk->nr_populated -= nr;
chunk->nr_empty_pop_pages -= nr;
pcpu_nr_empty_pop_pages -= nr;
+ pcpu_nr_populated -= nr;
}
/*
@@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
+ /* include all regions of the first chunk */
+ pcpu_nr_populated += PFN_DOWN(size_sum);
+
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(base_addr);
@@ -2746,6 +2759,22 @@ void __init setup_per_cpu_areas(void)
#endif /* CONFIG_SMP */
/*
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks. Metadata is
+ * excluded from the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata. It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
+ */
+unsigned long pcpu_nr_pages(void)
+{
+ return pcpu_nr_populated * pcpu_nr_units;
+}
+
+/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
* and running.
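
pcpu_nr_pages() is exported so /proc/meminfo can report percpu consumption. The consumer is a single line in the meminfo show routine, roughly as below (show_val_kb() being meminfo's kB-printing helper):

	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
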
diff --git a/mm/readahead.c b/mm/readahead.c
index e273f0de3376..a59ea70527b9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"
@@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
+ unsigned long add_pages;
pgoff_t prev_offset;
/*
@@ -474,10 +476,17 @@ readit:
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
+ * As above, cap the result at the maximum number of IO pages.
*/
if (offset == ra->start && ra->size == ra->async_size) {
- ra->async_size = get_next_ra_size(ra, max_pages);
- ra->size += ra->async_size;
+ add_pages = get_next_ra_size(ra, max_pages);
+ if (ra->size + add_pages <= max_pages) {
+ ra->async_size = add_pages;
+ ra->size += add_pages;
+ } else {
+ ra->size = max_pages;
+ ra->async_size = max_pages >> 1;
+ }
}
return ra_submit(ra, mapping, filp);
@@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping,
if (!ra->ra_pages)
return;
+ if (blk_cgroup_congested())
+ return;
+
/* be dumb */
if (filp && (filp->f_mode & FMODE_RANDOM)) {
force_page_cache_readahead(mapping, filp, offset, req_size);
@@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping,
if (inode_read_congested(mapping->host))
return;
+ if (blk_cgroup_congested())
+ return;
+
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
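
A concrete example of the new clamp: with max_pages = 128, a current window of ra->size = 120 and get_next_ra_size() proposing add_pages = 64, the old code would have grown the window to 184 pages; the new code caps it at ra->size = 128 with ra->async_size = 64, so a single readahead never exceeds the 128-page limit.
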
diff --git a/mm/rmap.c b/mm/rmap.c
index 6db729dc4c50..eb477809a5c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -64,6 +64,7 @@
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
#include <asm/tlbflush.h>
@@ -1481,11 +1482,16 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
set_pte_at(mm, address, pvmw.pte, pteval);
}
- } else if (pte_unused(pteval)) {
+ } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
+ * A future reference will then fault in a new zero
+ * page. When userfaultfd is active, we must not drop
+ * this page though, as its main user (postcopy
+ * migration) will not expect userfaults on already
+ * copied pages.
*/
dec_mm_counter(mm, mm_counter(page));
/* We have to invalidate as we cleared the pte */
diff --git a/mm/shmem.c b/mm/shmem.c
index 2cab84403055..0376c124b043 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,6 +29,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
+#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
@@ -123,7 +124,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
- struct vm_fault *vmf, int *fault_type);
+ struct vm_fault *vmf, vm_fault_t *fault_type);
int shmem_getpage(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp)
@@ -1239,8 +1240,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
- false);
+ error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+ &memcg, false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1420,7 +1421,7 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
struct shmem_inode_info *info, pgoff_t index)
{
/* Create a pseudo vma that just contains the policy */
- memset(vma, 0, sizeof(*vma));
+ vma_init(vma, NULL);
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
@@ -1619,7 +1620,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1712,7 +1714,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1818,7 +1820,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
PageTransHuge(page));
if (error)
goto unacct;
@@ -2187,7 +2189,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = get_seconds();
+ inode->i_generation = prandom_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
@@ -2291,7 +2293,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
__SetPageSwapBacked(page);
__SetPageUptodate(page);
- ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+ ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
if (ret)
goto out_release;
@@ -3896,18 +3898,11 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
/* common code */
-static const struct dentry_operations anon_ops = {
- .d_dname = simple_dname
-};
-
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
unsigned long flags, unsigned int i_flags)
{
- struct file *res;
struct inode *inode;
- struct path path;
- struct super_block *sb;
- struct qstr this;
+ struct file *res;
if (IS_ERR(mnt))
return ERR_CAST(mnt);
@@ -3918,41 +3913,21 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- res = ERR_PTR(-ENOMEM);
- this.name = name;
- this.len = strlen(name);
- this.hash = 0; /* will go */
- sb = mnt->mnt_sb;
- path.mnt = mntget(mnt);
- path.dentry = d_alloc_pseudo(sb, &this);
- if (!path.dentry)
- goto put_memory;
- d_set_d_op(path.dentry, &anon_ops);
-
- res = ERR_PTR(-ENOSPC);
- inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags);
- if (!inode)
- goto put_memory;
-
+ inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
+ flags);
+ if (unlikely(!inode)) {
+ shmem_unacct_size(flags, size);
+ return ERR_PTR(-ENOSPC);
+ }
inode->i_flags |= i_flags;
- d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+ if (!IS_ERR(res))
+ res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
+ &shmem_file_operations);
if (IS_ERR(res))
- goto put_path;
-
- res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
- &shmem_file_operations);
- if (IS_ERR(res))
- goto put_path;
-
- return res;
-
-put_memory:
- shmem_unacct_size(flags, size);
-put_path:
- path_put(&path);
+ iput(inode);
return res;
}
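
The rewrite leans on the then-new alloc_file_pseudo() VFS helper, which bundles the pseudo-dentry plus struct file setup that the removed code performed by hand. Its signature in fs/file_table.c is approximately:

struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
			       const char *name, int flags,
			       const struct file_operations *fops);

On failure it returns an ERR_PTR and leaves the caller's inode reference alone, which is why the error path above only needs iput().
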
diff --git a/mm/slab.h b/mm/slab.h
index 68bdf498da3b..58c6c1c2a78e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/* List of all root caches. */
extern struct list_head slab_root_caches;
@@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s);
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *));
-#else /* CONFIG_MEMCG && !CONFIG_SLOB */
+#else /* CONFIG_MEMCG_KMEM */
/* If !memcg, all caches are root. */
#define slab_root_caches slab_caches
@@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 890b1f04a03a..fea3376f9816 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
LIST_HEAD(slab_root_caches);
@@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s)
static inline void memcg_unlink_cache(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
/*
* Figure out what the alignment of the objects will be given a set of
@@ -567,10 +567,14 @@ static int shutdown_cache(struct kmem_cache *s)
list_del(&s->list);
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_unlink(s);
+#endif
list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_unlink(s);
sysfs_slab_release(s);
#else
slab_kmem_cache_release(s);
@@ -580,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s)
return 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
@@ -857,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s)
static inline void flush_memcg_workqueue(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
{
diff --git a/mm/slub.c b/mm/slub.c
index a3b8467c14af..8da34a8af53d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,7 +19,6 @@
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
-#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
#include <linux/cpu.h>
@@ -271,8 +270,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- if (object)
- prefetch(freelist_dereference(s, object + s->offset));
+ prefetch(object + s->offset);
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
@@ -5667,7 +5665,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
- kobject_del(&s->kobj);
out:
kobject_put(&s->kobj);
}
@@ -5752,6 +5749,12 @@ static void sysfs_slab_remove(struct kmem_cache *s)
schedule_work(&s->kobj_remove_work);
}
+void sysfs_slab_unlink(struct kmem_cache *s)
+{
+ if (slab_state >= FULL)
+ kobject_del(&s->kobj);
+}
+
void sysfs_slab_release(struct kmem_cache *s)
{
if (slab_state >= FULL)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bd0276d5f66b..8301293331a2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long goal)
{
return memblock_virt_alloc_try_nid_raw(size, align, goal,
- BOOTMEM_ALLOC_ACCESSIBLE, node);
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}
-static void *vmemmap_buf;
-static void *vmemmap_buf_end;
-
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
/* If the main allocator is up use that, fallback to bootmem. */
@@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
{
- void *ptr;
-
- if (!vmemmap_buf)
- return vmemmap_alloc_block(size, node);
-
- /* take the from buf */
- ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
- if (ptr + size > vmemmap_buf_end)
- return vmemmap_alloc_block(size, node);
-
- vmemmap_buf = ptr + size;
+ void *ptr = sparse_buffer_alloc(size);
+ if (!ptr)
+ ptr = vmemmap_alloc_block(size, node);
return ptr;
}
@@ -272,45 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
return map;
}
-
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
- void *vmemmap_buf_start;
-
- size = ALIGN(size, PMD_SIZE);
- vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
- PMD_SIZE, __pa(MAX_DMA_ADDRESS));
-
- if (vmemmap_buf_start) {
- vmemmap_buf = vmemmap_buf_start;
- vmemmap_buf_end = vmemmap_buf_start + size * map_count;
- }
-
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- struct mem_section *ms;
-
- if (!present_section_nr(pnum))
- continue;
-
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[pnum])
- continue;
- ms = __nr_to_section(pnum);
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- }
-
- if (vmemmap_buf_start) {
- /* need to free left buf */
- memblock_free_early(__pa(vmemmap_buf),
- vmemmap_buf_end - vmemmap_buf);
- vmemmap_buf = NULL;
- vmemmap_buf_end = NULL;
- }
-}
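
The per-node vmemmap_buf bookkeeping removed here reappears in generic form as the sparse_buffer_*() allocator in mm/sparse.c (next file). The intended early-init calling pattern, sketched under assumptions about the reworked sparse_init() (function name and loop details are illustrative):

static void __init sparse_init_nid_sketch(int nid, unsigned long pnum_begin,
					  unsigned long pnum_end,
					  unsigned long map_count)
{
	unsigned long pnum;

	/* one big memblock chunk that sparse_buffer_alloc() carves up */
	sparse_buffer_init(map_count * section_map_size(), nid);
	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
		struct page *map;

		if (!present_section_nr(pnum))
			continue;
		/* tries the buffer first, falls back to a fresh allocation */
		map = sparse_mem_map_populate(pnum, nid, NULL);
		if (!map)
			pr_err("memmap backing failed for section %lu\n", pnum);
		/* on success, sparse_init_one_section() wires the map up */
	}
	sparse_buffer_fini();	/* return any unused tail to memblock */
}
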
diff --git a/mm/sparse.c b/mm/sparse.c
index f13f2723950a..10b07eea9a6e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -200,6 +200,11 @@ static inline int next_present_section_nr(int section_nr)
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
+static inline unsigned long first_present_section_nr(void)
+{
+ return next_present_section_nr(-1);
+}
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -257,19 +262,14 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
-static int __meminit sparse_init_one_section(struct mem_section *ms,
+static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
unsigned long *pageblock_bitmap)
{
- if (!present_section(ms))
- return -EINVAL;
-
ms->section_mem_map &= ~SECTION_MAP_MASK;
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
SECTION_HAS_MEM_MAP;
ms->pageblock_flags = pageblock_bitmap;
-
- return 1;
}
unsigned long usemap_size(void)
@@ -370,160 +370,121 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static void __init sparse_early_usemaps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long usemap_count, int nodeid)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static unsigned long __init section_map_size(void)
{
- void *usemap;
- unsigned long pnum;
- unsigned long **usemap_map = (unsigned long **)data;
- int size = usemap_size();
-
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
- size * usemap_count);
- if (!usemap) {
- pr_warn("%s: allocation failed\n", __func__);
- return;
- }
+ return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
+}
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- usemap_map[pnum] = usemap;
- usemap += size;
- check_usemap_section_nr(nodeid, usemap_map[pnum]);
- }
+#else
+static unsigned long __init section_map_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap)
{
- struct page *map;
- unsigned long size;
+ unsigned long size = section_map_size();
+ struct page *map = sparse_buffer_alloc(size);
+
+ if (map)
+ return map;
- size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
map = memblock_virt_alloc_try_nid(size,
PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- void *map;
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
-
- size = PAGE_ALIGN(size);
- map = memblock_virt_alloc_try_nid_raw(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
- if (map) {
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- map_map[pnum] = map;
- map += size;
- }
- return;
- }
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
- /* fallback */
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- struct mem_section *ms;
+static void *sparsemap_buf __meminitdata;
+static void *sparsemap_buf_end __meminitdata;
- if (!present_section_nr(pnum))
- continue;
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[pnum])
- continue;
- ms = __nr_to_section(pnum);
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- }
+static void __init sparse_buffer_init(unsigned long size, int nid)
+{
+ WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
+ sparsemap_buf =
+ memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf_end = sparsemap_buf + size;
}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-static void __init sparse_early_mem_maps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
+static void __init sparse_buffer_fini(void)
{
- struct page **map_map = (struct page **)data;
- sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
- map_count, nodeid);
+ unsigned long size = sparsemap_buf_end - sparsemap_buf;
+
+ if (sparsemap_buf && size > 0)
+ memblock_free_early(__pa(sparsemap_buf), size);
+ sparsemap_buf = NULL;
}
-#else
-static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
-{
- struct page *map;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
- map = sparse_mem_map_populate(pnum, nid, NULL);
- if (map)
- return map;
+void * __meminit sparse_buffer_alloc(unsigned long size)
+{
+ void *ptr = NULL;
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- return NULL;
+ if (sparsemap_buf) {
+ ptr = PTR_ALIGN(sparsemap_buf, size);
+ if (ptr + size > sparsemap_buf_end)
+ ptr = NULL;
+ else
+ sparsemap_buf = ptr + size;
+ }
+ return ptr;
}
-#endif
void __weak __meminit vmemmap_populate_print_last(void)
{
}
-/**
- * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
- * @map: usemap_map for pageblock flags or mmap_map for vmemmap
+/*
+ * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end),
+ * and the number of present sections in this node is map_count.
*/
-static void __init alloc_usemap_and_memmap(void (*alloc_func)
- (void *, unsigned long, unsigned long,
- unsigned long, int), void *data)
+static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count)
{
- unsigned long pnum;
- unsigned long map_count;
- int nodeid_begin = 0;
- unsigned long pnum_begin = 0;
-
- for_each_present_section_nr(0, pnum) {
- struct mem_section *ms;
+ unsigned long pnum, usemap_longs, *usemap;
+ struct page *map;
- ms = __nr_to_section(pnum);
- nodeid_begin = sparse_early_nid(ms);
- pnum_begin = pnum;
- break;
+ usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ usemap_size() *
+ map_count);
+ if (!usemap) {
+ pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
+ goto failed;
+ }
+ sparse_buffer_init(map_count * section_map_size(), nid);
+ for_each_present_section_nr(pnum_begin, pnum) {
+ if (pnum >= pnum_end)
+ break;
+
+ map = sparse_mem_map_populate(pnum, nid, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ goto failed;
+ }
+ check_usemap_section_nr(nid, usemap);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
+ usemap += usemap_longs;
}
- map_count = 1;
- for_each_present_section_nr(pnum_begin + 1, pnum) {
+ sparse_buffer_fini();
+ return;
+failed:
+ /* We failed to allocate; mark all the following pnums as not present */
+ for_each_present_section_nr(pnum_begin, pnum) {
struct mem_section *ms;
- int nodeid;
+ if (pnum >= pnum_end)
+ break;
ms = __nr_to_section(pnum);
- nodeid = sparse_early_nid(ms);
- if (nodeid == nodeid_begin) {
- map_count++;
- continue;
- }
- /* ok, we need to take cake of from pnum_begin to pnum - 1*/
- alloc_func(data, pnum_begin, pnum,
- map_count, nodeid_begin);
- /* new start, update count etc*/
- nodeid_begin = nodeid;
- pnum_begin = pnum;
- map_count = 1;
+ ms->section_mem_map = 0;
}
- /* ok, last chunk */
- alloc_func(data, pnum_begin, __highest_present_section_nr+1,
- map_count, nodeid_begin);
}
/*
@@ -532,72 +493,29 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
*/
void __init sparse_init(void)
{
- unsigned long pnum;
- struct page *map;
- unsigned long *usemap;
- unsigned long **usemap_map;
- int size;
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- int size2;
- struct page **map_map;
-#endif
-
- /* see include/linux/mmzone.h 'struct mem_section' definition */
- BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
+ unsigned long pnum_begin = first_present_section_nr();
+ int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
+ unsigned long pnum_end, map_count = 1;
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
- /*
- * map is using big page (aka 2M in x86 64 bit)
- * usemap is less one page (aka 24 bytes)
- * so alloc 2M (with 2M align) and 24 bytes in turn will
- * make next 2M slip to one more 2M later.
- * then in big system, the memory will have a lot of holes...
- * here try to allocate 2M pages continuously.
- *
- * powerpc need to call sparse_init_one_section right after each
- * sparse_early_mem_map_alloc, so allocate usemap_map at first.
- */
- size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = memblock_virt_alloc(size, 0);
- if (!usemap_map)
- panic("can not allocate usemap_map\n");
- alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map);
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = memblock_virt_alloc(size2, 0);
- if (!map_map)
- panic("can not allocate map_map\n");
- alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map);
-#endif
-
- for_each_present_section_nr(0, pnum) {
- usemap = usemap_map[pnum];
- if (!usemap)
- continue;
+ for_each_present_section_nr(pnum_begin + 1, pnum_end) {
+ int nid = sparse_early_nid(__nr_to_section(pnum_end));
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- map = map_map[pnum];
-#else
- map = sparse_early_mem_map_alloc(pnum);
-#endif
- if (!map)
+ if (nid == nid_begin) {
+ map_count++;
continue;
-
- sparse_init_one_section(__nr_to_section(pnum), pnum, map,
- usemap);
+ }
+ /* Init node with sections in range [pnum_begin, pnum_end) */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+ nid_begin = nid;
+ pnum_begin = pnum_end;
+ map_count = 1;
}
-
+ /* cover the last node */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
vmemmap_populate_print_last();
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- memblock_free_early(__pa(map_map), size2);
-#endif
- memblock_free_early(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -760,6 +678,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
+ ret = 0;
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
if (!memmap)
return -ENOMEM;
@@ -786,12 +705,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
#endif
section_mark_present(ms);
-
- ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
+ sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
pgdat_resize_unlock(pgdat, &flags);
- if (ret <= 0) {
+ if (ret < 0) {
kfree(usemap);
__kfree_section_memmap(memmap, altmap);
}
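
Note on the new allocator above: sparse_buffer_alloc() replaces the old per-node populate loops with a bump allocator over one preallocated per-node region, aligning the cursor up to the request size, handing out the chunk if it still fits, and returning NULL so callers fall back to a normal allocation. A minimal userspace sketch of the same pattern (names and sizes are illustrative, not the kernel's):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static char *buf, *buf_end;

/* Round p up to the next multiple of align (align must be a power of two). */
static void *ptr_align(void *p, uintptr_t align)
{
	return (void *)(((uintptr_t)p + align - 1) & ~(align - 1));
}

/* Carve size bytes out of the buffer, or return NULL so the caller can fall back. */
static void *buffer_alloc(size_t size)
{
	char *ptr = ptr_align(buf, size);

	if (ptr + size > buf_end)
		return NULL;
	buf = ptr + size;
	return ptr;
}

int main(void)
{
	char backing[4096];

	buf = backing;
	buf_end = backing + sizeof(backing);

	void *a = buffer_alloc(64);
	void *b = buffer_alloc(64);
	printf("a=%p b=%p\n", a, b);	/* b lands 64 bytes after a */
	return 0;
}

Aligning to the request size works here because the section map sizes are PMD- or PAGE-aligned powers of two, which is what the alignment arithmetic assumes.
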
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index a791411fed71..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -38,9 +38,9 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
static bool swap_slot_cache_active;
bool swap_slot_cache_enabled;
static bool swap_slot_cache_initialized;
-DEFINE_MUTEX(swap_slots_cache_mutex);
+static DEFINE_MUTEX(swap_slots_cache_mutex);
/* Serialize swap slots cache enable/disable operations */
-DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
- cache->slots);
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+ cache->slots, 1);
return cache->nr;
}
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, true, &entry);
+ get_swap_pages(1, &entry, HPAGE_PMD_NR);
goto out;
}
@@ -350,7 +350,7 @@ repeat:
goto out;
}
- get_swap_pages(1, false, &entry);
+ get_swap_pages(1, &entry, 1);
out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
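
The call sites above show the interface change: the old cluster flag becomes an explicit entry size in pages (1 for a normal page, HPAGE_PMD_NR for THP). The accounting consequence is that availability scales with the entry size; a standalone sketch of that scaling (toy numbers, assumed names):

#include <stdio.h>

static long nr_swap_pages = 10;

/* Mirror of the new accounting: each returned entry represents
 * entry_size pages, so availability is scaled by the size. */
static int get_slots(int n_goal, int entry_size)
{
	long avail = nr_swap_pages / entry_size;

	if (avail <= 0)
		return 0;
	if (n_goal > avail)
		n_goal = avail;
	nr_swap_pages -= (long)n_goal * entry_size;
	return n_goal;
}

int main(void)
{
	printf("4 singles -> got %d\n", get_slots(4, 1)); /* 4 */
	printf("1 huge(8) -> got %d\n", get_slots(1, 8)); /* 0: only 6 pages left */
	return 0;
}
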
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2cc2972eedaf..d954b71c4f9c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+
+#define swap_entry_size(size) (size)
#else
#define SWAPFILE_CLUSTER 256
+
+/*
+ * Define swap_entry_size() as a constant to let the compiler optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size) 1
#endif
#define LATENCY_LIMIT 256
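
Defining swap_entry_size() as the literal 1 when THP swap is off lets the compiler prove every later size == SWAPFILE_CLUSTER test false and drop the cluster paths as dead code, without an #ifdef at each call site. A minimal sketch of the idiom (CLUSTER and the function are illustrative):

#include <stdio.h>

#define THP_SWAP_ENABLED 0

#if THP_SWAP_ENABLED
#define swap_entry_size(n) (n)
#else
/* Constant-folds: any "size == CLUSTER" branch below becomes dead code. */
#define swap_entry_size(n) 1
#endif

#define CLUSTER 256

static void put_entries(int nr_pages)
{
	int size = swap_entry_size(nr_pages);

	if (size == CLUSTER)
		printf("cluster path (compiled out when disabled)\n");
	else
		printf("single-entry path, size=%d\n", size);
}

int main(void)
{
	put_entries(256);
	return 0;
}
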
@@ -269,7 +277,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
- return info->flags & CLUSTER_FLAG_HUGE;
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ return info->flags & CLUSTER_FLAG_HUGE;
+ return false;
}
static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -296,13 +306,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
spin_unlock(&ci->lock);
}
+/*
+ * Determine the locking method in use for this device. Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si,
- unsigned long offset)
+ struct swap_info_struct *si, unsigned long offset)
{
struct swap_cluster_info *ci;
+ /* Try to use fine-grained SSD-style locking if available: */
ci = lock_cluster(si, offset);
+ /* Otherwise, fall back to traditional, coarse locking: */
if (!ci)
spin_lock(&si->lock);
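
The comments added above describe a two-level locking scheme: a fine-grained per-cluster lock when the device has cluster info (the SSD path), with the coarse per-device lock as the fallback. A sketch of the shape using pthreads in place of the kernel spinlocks (assumed types and names):

#include <pthread.h>
#include <stddef.h>

struct cluster { pthread_mutex_t lock; };
struct device {
	struct cluster *clusters;	/* NULL on rotating media */
	size_t nr_clusters;
	pthread_mutex_t lock;		/* coarse fallback */
};

/* Returns the cluster locked, or NULL if the coarse device lock was taken. */
static struct cluster *lock_cluster_or_device(struct device *d, size_t idx)
{
	if (d->clusters) {
		struct cluster *c = &d->clusters[idx % d->nr_clusters];

		pthread_mutex_lock(&c->lock);
		return c;
	}
	pthread_mutex_lock(&d->lock);
	return NULL;
}

static void unlock_cluster_or_device(struct device *d, struct cluster *c)
{
	if (c)
		pthread_mutex_unlock(&c->lock);
	else
		pthread_mutex_unlock(&d->lock);
}

int main(void)
{
	struct device d = { .clusters = NULL,
			    .lock = PTHREAD_MUTEX_INITIALIZER };
	struct cluster *c = lock_cluster_or_device(&d, 0);

	unlock_cluster_or_device(&d, c);
	return 0;
}
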
@@ -863,7 +878,6 @@ no_page:
return n_ret;
}
-#ifdef CONFIG_THP_SWAP
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
unsigned long idx;
@@ -871,6 +885,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
+ /*
+ * Should not even be attempting cluster allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+ VM_WARN_ON_ONCE(1);
+ return 0;
+ }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
@@ -901,13 +924,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
- VM_WARN_ON_ONCE(1);
- return 0;
-}
-#endif /* CONFIG_THP_SWAP */
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
@@ -924,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
- unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+ unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
int node;
/* Only single cluster request supported */
- WARN_ON_ONCE(n_goal > 1 && cluster);
+ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
- avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+ avail_pgs = atomic_long_read(&nr_swap_pages) / size;
if (avail_pgs <= 0)
goto noswap;
@@ -945,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
- atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+ atomic_long_sub(n_goal * size, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -972,14 +988,14 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
- if (cluster) {
+ if (size == SWAPFILE_CLUSTER) {
if (!(si->flags & SWP_FILE))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
- if (n_ret || cluster)
+ if (n_ret || size == SWAPFILE_CLUSTER)
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
@@ -1005,7 +1021,7 @@ nextsi:
check_out:
if (n_ret < n_goal)
- atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+ atomic_long_add((long)(n_goal - n_ret) * size,
&nr_swap_pages);
noswap:
return n_ret;
@@ -1107,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
return p;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
{
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
- ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
has_cache = count & SWAP_HAS_CACHE;
@@ -1144,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+ return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+ usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
return usage;
@@ -1184,19 +1208,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-static void swapcache_free(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
- free_swap_slot(entry);
- }
-}
-
-#ifdef CONFIG_THP_SWAP
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1205,42 +1217,48 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
+ int size = swap_entry_size(hpage_nr_pages(page));
si = _swap_info_get(entry);
if (!si)
return;
- ci = lock_cluster(si, offset);
- VM_BUG_ON(!cluster_is_huge(ci));
- map = si->swap_map + offset;
- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- val = map[i];
- VM_BUG_ON(!(val & SWAP_HAS_CACHE));
- if (val == SWAP_HAS_CACHE)
- free_entries++;
- }
- if (!free_entries) {
- for (i = 0; i < SWAPFILE_CLUSTER; i++)
- map[i] &= ~SWAP_HAS_CACHE;
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (size == SWAPFILE_CLUSTER) {
+ VM_BUG_ON(!cluster_is_huge(ci));
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ cluster_clear_huge(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, offset);
+ memset(map, 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ return;
+ }
}
- cluster_clear_huge(ci);
- unlock_cluster(ci);
- if (free_entries == SWAPFILE_CLUSTER) {
- spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
- } else if (free_entries) {
- for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
- if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
- free_swap_slot(entry);
+ for (i = 0; i < size; i++, entry.val++) {
+ if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+ unlock_cluster_or_swap_info(si, ci);
+ free_swap_slot(entry);
+ if (i == size - 1)
+ return;
+ lock_cluster_or_swap_info(si, offset);
}
}
+ unlock_cluster_or_swap_info(si, ci);
}
+#ifdef CONFIG_THP_SWAP
int split_swap_cluster(swp_entry_t entry)
{
struct swap_info_struct *si;
@@ -1255,19 +1273,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
}
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
-
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
- if (!PageTransHuge(page))
- swapcache_free(entry);
- else
- swapcache_free_cluster(entry);
-}
+#endif
static int swp_entry_cmp(const void *ent1, const void *ent2)
{
@@ -1409,7 +1415,6 @@ out:
return count;
}
-#ifdef CONFIG_THP_SWAP
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
swp_entry_t entry)
{
@@ -1422,12 +1427,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
- if (map[roffset] != SWAP_HAS_CACHE)
+ if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- if (map[offset + i] != SWAP_HAS_CACHE) {
+ if (swap_count(map[offset + i])) {
ret = true;
break;
}
@@ -1442,7 +1447,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
- if (likely(!PageTransCompound(page)))
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
page = compound_head(page);
@@ -1466,10 +1471,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (total_mapcount)
- *total_mapcount = mapcount;
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
if (PageSwapCache(page))
swapcount = page_swapcount(page);
if (total_swapcount)
@@ -1516,26 +1519,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
return map_swapcount;
}
-#else
-#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
-#define page_swapped(page) (page_swapcount(page) != 0)
-
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
- int *total_swapcount)
-{
- int mapcount, swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- mapcount = page_trans_huge_mapcount(page, total_mapcount);
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return mapcount + swapcount;
-}
-#endif
/*
* We can write to an anon page without COW if there are no other references
@@ -2909,6 +2892,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
return 0;
}
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+ return swp_offset(pte_to_swp_entry(
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+ return generic_max_swapfile_size();
+}
+
static unsigned long read_swap_header(struct swap_info_struct *p,
union swap_header *swap_header,
struct inode *inode)
@@ -2944,22 +2956,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
p->cluster_next = 1;
p->cluster_nr = 0;
- /*
- * Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number
- * of bits for the swap offset in the swp_entry_t type, and
- * 2) the number of bits in the swap pte as defined by the
- * different architectures. In order to find the
- * largest possible bit mask, a swap entry with swap type 0
- * and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again, and finally the swap
- * offset is extracted. This will mask all the bits from
- * the initial ~0UL mask that can't be encoded in either
- * the swp_entry_t or the architecture definition of a
- * swap pte.
- */
- maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ maxpages = max_swapfile_size();
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
@@ -3731,6 +3728,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+ gfp_t gfp_mask)
+{
+ struct swap_info_struct *si, *next;
+ if (!(gfp_mask & __GFP_IO) || !memcg)
+ return;
+
+ if (!blk_cgroup_congested())
+ return;
+
+ /*
+ * We've already scheduled a throttle; avoid taking the global swap
+ * lock.
+ */
+ if (current->throttle_queue)
+ return;
+
+ spin_lock(&swap_avail_lock);
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+ avail_lists[node]) {
+ if (si->bdev) {
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+ true);
+ break;
+ }
+ }
+ spin_unlock(&swap_avail_lock);
+}
+#endif
+
static int __init swapfile_init(void)
{
int nid;
diff --git a/mm/usercopy.c b/mm/usercopy.c
index e9e9325f7638..852eb4e53f06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -20,6 +20,8 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include <asm/sections.h>
/*
@@ -240,6 +242,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
+static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+
/*
* Validates that the given object is:
* - not bogus address
@@ -248,6 +252,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
+ if (static_branch_unlikely(&bypass_usercopy_checks))
+ return;
+
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -279,3 +286,21 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);
+
+static bool enable_checks __initdata = true;
+
+static int __init parse_hardened_usercopy(char *str)
+{
+ return strtobool(str, &enable_checks);
+}
+
+__setup("hardened_usercopy=", parse_hardened_usercopy);
+
+static int __init set_hardened_usercopy(void)
+{
+ if (enable_checks == false)
+ static_branch_enable(&bypass_usercopy_checks);
+ return 1;
+}
+
+late_initcall(set_hardened_usercopy);
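
The net effect of this usercopy.c change is a boot-time kill switch: hardened_usercopy=off flips a read-only static branch so __check_object_size() returns immediately, while the static key costs nothing per call when checks stay enabled. A userspace sketch of the parse-then-latch pattern, with strtobool() approximated by string compares (assumed names, not the kernel's):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool enable_checks = true;	/* default: checks on */
static bool bypass_checks;		/* latched once at "boot" */

/* Rough stand-in for strtobool(): accepts on/off, 1/0, y/n. */
static int parse_bool(const char *s, bool *res)
{
	if (!strcmp(s, "on") || !strcmp(s, "1") || !strcmp(s, "y"))
		*res = true;
	else if (!strcmp(s, "off") || !strcmp(s, "0") || !strcmp(s, "n"))
		*res = false;
	else
		return -1;
	return 0;
}

static void check_object_size(void)
{
	if (bypass_checks)
		return;		/* static_branch_unlikely() in the kernel */
	printf("running usercopy checks\n");
}

int main(void)
{
	parse_bool("off", &enable_checks);	/* hardened_usercopy=off */
	if (!enable_checks)
		bypass_checks = true;		/* set_hardened_usercopy() */
	check_object_size();			/* silent: checks bypassed */
	return 0;
}
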
diff --git a/mm/util.c b/mm/util.c
index 3351659200e6..d2890a407332 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -196,7 +196,7 @@ void *vmemdup_user(const void __user *src, size_t len)
}
EXPORT_SYMBOL(vmemdup_user);
-/*
+/**
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
@@ -434,6 +434,13 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
}
EXPORT_SYMBOL(kvmalloc_node);
+/**
+ * kvfree - free memory allocated with kvmalloc
+ * @addr: pointer returned by kvmalloc
+ *
+ * If the memory is allocated from vmalloc area it is freed with vfree().
+ * Otherwise kfree() is used.
+ */
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
diff --git a/mm/vmacache.c b/mm/vmacache.c
index db7596eb6132..ea517bef7dc5 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -6,6 +6,18 @@
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
+#include <asm/pgtable.h>
+
+/*
+ * Hash based on the pmd of addr if configured with MMU, which provides a good
+ * hit rate for workloads with spatial locality. Otherwise, use pages.
+ */
+#ifdef CONFIG_MMU
+#define VMACACHE_SHIFT PMD_SHIFT
+#else
+#define VMACACHE_SHIFT PAGE_SHIFT
+#endif
+#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
/*
* Flush vma caches for threads that share a given mm.
@@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm)
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
+ int idx = VMACACHE_HASH(addr);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
@@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
- if (!vma)
- continue;
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
+ if (vma) {
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break;
+#endif
+ if (vma->vm_start <= addr && vma->vm_end > addr) {
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
}
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
}
return NULL;
@@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
+ int idx = VMACACHE_HASH(start);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
@@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
if (vma && vma->vm_start == start && vma->vm_end == end) {
count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
}
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
}
return NULL;
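
Hashing on PMD_SHIFT means every address in the same 2 MiB region (on typical 64-bit configs) starts at the same cache slot, so lookups with spatial locality hit on the first probe and the loop only ring-scans the remaining slots on a miss. A quick standalone check of the slot arithmetic (a VMACACHE_SIZE of 4 and a 21-bit shift are assumed):

#include <stdio.h>

#define VMACACHE_SIZE 4
#define VMACACHE_MASK (VMACACHE_SIZE - 1)
#define PMD_SHIFT 21	/* 2 MiB regions on x86-64 */
#define VMACACHE_HASH(addr) (((addr) >> PMD_SHIFT) & VMACACHE_MASK)

int main(void)
{
	unsigned long a = 0x7f0000200000UL;

	/* Same 2 MiB region -> same slot; next region -> next slot. */
	printf("slot(a)    = %lu\n", VMACACHE_HASH(a));
	printf("slot(a+4K) = %lu\n", VMACACHE_HASH(a + 4096));
	printf("slot(a+2M) = %lu\n", VMACACHE_HASH(a + (1UL << 21)));
	return 0;
}
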
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cfea25be7754..a728fc492557 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1907,10 +1907,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f86f288..7e7d25504651 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,12 +65,6 @@ struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
- /* This context's GFP mask */
- gfp_t gfp_mask;
-
- /* Allocation order */
- int order;
-
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
@@ -83,12 +77,6 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
- /* Scan (total_size >> priority) pages at once */
- int priority;
-
- /* The highest zone to isolate pages for reclaim from */
- enum zone_type reclaim_idx;
-
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
@@ -111,6 +99,18 @@ struct scan_control {
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
+ /* Allocation order */
+ s8 order;
+
+ /* Scan (total_size >> priority) pages at once */
+ s8 priority;
+
+ /* The highest zone to isolate pages for reclaim from */
+ s8 reclaim_idx;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -169,6 +169,70 @@ unsigned long vm_total_pages;
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+#ifdef CONFIG_MEMCG_KMEM
+
+/*
+ * We allow subsystems to populate their shrinker-related
+ * LRU lists before register_shrinker_prepared() is called
+ * for the shrinker, since we don't want to impose
+ * restrictions on their internal registration order.
+ * In this case shrink_slab_memcg() may find the corresponding
+ * bit set in the shrinkers map.
+ *
+ * This value is used by the function to detect registering
+ * shrinkers and to skip do_shrink_slab() calls for them.
+ */
+#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+
+static DEFINE_IDR(shrinker_idr);
+static int shrinker_nr_max;
+
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id, ret = -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+ /* This may call shrinker, so it must use down_read_trylock() */
+ id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto unlock;
+
+ if (id >= shrinker_nr_max) {
+ if (memcg_expand_shrinker_maps(id)) {
+ idr_remove(&shrinker_idr, id);
+ goto unlock;
+ }
+
+ shrinker_nr_max = id + 1;
+ }
+ shrinker->id = id;
+ ret = 0;
+unlock:
+ up_write(&shrinker_rwsem);
+ return ret;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id = shrinker->id;
+
+ BUG_ON(id < 0);
+
+ down_write(&shrinker_rwsem);
+ idr_remove(&shrinker_idr, id);
+ up_write(&shrinker_rwsem);
+}
+#else /* CONFIG_MEMCG_KMEM */
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
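
Registration of memcg-aware shrinkers is now two-phase: prealloc_memcg_shrinker() reserves an IDR id but stores the SHRINKER_REGISTERING sentinel, and register_shrinker_prepared() later idr_replace()s it with the real pointer, so a concurrent walker can tell a reserved slot from a live one. A minimal sketch of the sentinel idiom over a plain array standing in for the IDR (illustrative names):

#include <stdio.h>
#include <stddef.h>

#define MAX_IDS 8
#define REGISTERING ((void *)~0UL)	/* sentinel: id reserved, not live */

static void *slots[MAX_IDS];

static int id_alloc(void)
{
	for (int i = 0; i < MAX_IDS; i++) {
		if (!slots[i]) {
			slots[i] = REGISTERING;	/* reserve */
			return i;
		}
	}
	return -1;
}

static void id_publish(int id, void *obj)
{
	slots[id] = obj;	/* idr_replace() in the kernel */
}

static void walk(void)
{
	for (int i = 0; i < MAX_IDS; i++) {
		if (!slots[i] || slots[i] == REGISTERING)
			continue;	/* skip empty and half-registered */
		printf("calling shrinker %d\n", i);
	}
}

int main(void)
{
	int obj = 42;
	int id = id_alloc();

	walk();			/* nothing: still registering */
	id_publish(id, &obj);
	walk();			/* now visible */
	return 0;
}
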
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker)
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ if (prealloc_memcg_shrinker(shrinker))
+ goto free_deferred;
+ }
+
return 0;
+
+free_deferred:
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+ return -ENOMEM;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
+ if (!shrinker->nr_deferred)
+ return;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -326,6 +407,10 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
+#ifdef CONFIG_MEMCG_KMEM
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ idr_replace(&shrinker_idr, shrinker, shrinker->id);
+#endif
up_write(&shrinker_rwsem);
}
@@ -347,6 +432,8 @@ void unregister_shrinker(struct shrinker *shrinker)
{
if (!shrinker->nr_deferred)
return;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
@@ -371,9 +458,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
: SHRINK_BATCH;
long scanned = 0, next_deferred;
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0)
- return 0;
+ if (freeable == 0 || freeable == SHRINK_EMPTY)
+ return freeable;
/*
* copy the current shrinker scan count into a local variable
@@ -474,6 +564,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
+#ifdef CONFIG_MEMCG_KMEM
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ struct memcg_shrinker_map *map;
+ unsigned long freed = 0;
+ int ret, i;
+
+ if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ return 0;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
+ true);
+ if (unlikely(!map))
+ goto unlock;
+
+ for_each_set_bit(i, map->map, shrinker_nr_max) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+
+ shrinker = idr_find(&shrinker_idr, i);
+ if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (!shrinker)
+ clear_bit(i, map->map);
+ continue;
+ }
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(i, map->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure the bit stays set in this case,
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * memcg_set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ memcg_set_shrinker_bit(memcg, nid, i);
+ }
+ freed += ret;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
+ }
+unlock:
+ up_read(&shrinker_rwsem);
+ return freed;
+}
+#else /* CONFIG_MEMCG_KMEM */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
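
The clear_bit()/set_bit() protocol in shrink_slab_memcg() above is the classic store/barrier/load pairing: each side writes its flag, issues a full barrier, then reads the other side's flag, so at least one of them observes the other's update and an object can never be stranded with the bit clear. A compressed sketch with C11 atomics, with fences standing in for the kernel's <MB>:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool list_nonempty;
static atomic_bool shrinker_bit;

/* list_lru_add() side: add the object, barrier, set the map bit. */
static void producer(void)
{
	atomic_store(&list_nonempty, true);
	atomic_thread_fence(memory_order_seq_cst);	/* <MB> */
	atomic_store(&shrinker_bit, true);
}

/* shrink_slab_memcg() side: clear the bit, barrier, re-check the list. */
static void consumer(void)
{
	atomic_store(&shrinker_bit, false);
	atomic_thread_fence(memory_order_seq_cst);	/* <MB> */
	if (atomic_load(&list_nonempty))
		atomic_store(&shrinker_bit, true);	/* re-set: not empty */
}

int main(void)
{
	producer();
	consumer();
	printf("bit=%d (still set, object not lost)\n",
	       atomic_load(&shrinker_bit));
	return 0;
}
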
/**
* shrink_slab - shrink slab caches
* @gfp_mask: allocation context
@@ -486,10 +654,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
- * @memcg specifies the memory cgroup to target. If it is not NULL,
- * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise, only unaware
- * shrinkers are called.
+ * @memcg specifies the memory cgroup to target. Unaware shrinkers
+ * are called only if it is the root cgroup.
*
* @priority is sc->priority, we take the number of objects and >> by priority
* in order to get the scan target.
@@ -502,9 +668,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ int ret;
- if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
- return 0;
+ if (!mem_cgroup_is_root(memcg))
+ return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
goto out;
@@ -516,19 +683,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- /*
- * If kernel memory accounting is disabled, we ignore
- * SHRINKER_MEMCG_AWARE flag and call all shrinkers
- * passing NULL for memcg.
- */
- if (memcg_kmem_enabled() &&
- !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- sc.nid = 0;
-
- freed += do_shrink_slab(&sc, shrinker, priority);
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ freed += ret;
/*
* Bail out if someone wants to register a new shrinker to
* prevent the registration from being stalled for long periods
@@ -554,6 +712,7 @@ void drop_slab_node(int nid)
struct mem_cgroup *memcg = NULL;
freed = 0;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
@@ -744,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
@@ -2573,9 +2732,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- if (memcg)
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
+ memcg, sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -2599,10 +2757,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
- if (global_reclaim(sc))
- shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
- sc->priority);
-
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -3064,6 +3218,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
};
/*
+ * scan_control uses s8 fields for order, priority, and reclaim_idx.
+ * Confirm they are large enough for max values.
+ */
+ BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
+ BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
+
+ /*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
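
Narrowing order, priority, and reclaim_idx to s8 (and moving gfp_mask) packs scan_control tighter, and the three BUILD_BUG_ON()s added to try_to_free_pages() make the compiler reject any future configuration where the maxima no longer fit. The same guard expressed with C11 static_assert (toy limit values assumed):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ORDER    11
#define DEF_PRIORITY 12
#define MAX_NR_ZONES 5
#define S8_MAX       INT8_MAX

/* Compile-time equivalents of the BUILD_BUG_ON()s in try_to_free_pages(). */
static_assert(MAX_ORDER <= S8_MAX, "order does not fit in s8");
static_assert(DEF_PRIORITY <= S8_MAX, "priority does not fit in s8");
static_assert(MAX_NR_ZONES <= S8_MAX, "reclaim_idx does not fit in s8");

struct scan_control_tail {	/* field order mirrors the patch */
	int8_t order;
	int8_t priority;
	int8_t reclaim_idx;
	unsigned int gfp_mask;	/* gfp_t in the kernel */
};

int main(void)
{
	printf("tail size: %zu bytes\n", sizeof(struct scan_control_tail));
	return 0;
}
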
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 75eda9c2b260..8ba0870ecddd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1796,11 +1796,9 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
- preempt_disable();
queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
- preempt_enable();
}
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c83978..4516dd790129 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -366,10 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long nodes;
unsigned long cache;
- /* list_lru lock nests inside the IRQ-safe i_pages lock */
- local_irq_disable();
nodes = list_lru_shrink_count(&shadow_nodes, sc);
- local_irq_enable();
/*
* Approximate a reasonable limit for the radix tree nodes
@@ -402,6 +399,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
}
max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
+ if (!nodes)
+ return SHRINK_EMPTY;
+
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
@@ -434,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock(lru_lock);
+ spin_unlock_irq(lru_lock);
ret = LRU_RETRY;
goto out;
}
@@ -472,26 +472,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
workingset_lookup_update(mapping));
out_invalid:
- xa_unlock(&mapping->i_pages);
+ xa_unlock_irq(&mapping->i_pages);
ret = LRU_REMOVED_RETRY;
out:
- local_irq_enable();
cond_resched();
- local_irq_disable();
- spin_lock(lru_lock);
+ spin_lock_irq(lru_lock);
return ret;
}
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long ret;
-
/* list_lru lock nests inside the IRQ-safe i_pages lock */
- local_irq_disable();
- ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
- local_irq_enable();
- return ret;
+ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
+ NULL);
}
static struct shrinker workingset_shadow_shrinker = {
@@ -528,15 +522,17 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
+ ret = prealloc_shrinker(&workingset_shadow_shrinker);
if (ret)
goto err;
- ret = register_shrinker(&workingset_shadow_shrinker);
+ ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
+ &workingset_shadow_shrinker);
if (ret)
goto err_list_lru;
+ register_shrinker_prepared(&workingset_shadow_shrinker);
return 0;
err_list_lru:
- list_lru_destroy(&shadow_nodes);
+ free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
return ret;
}
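
workingset_init() now reserves the shrinker first, wires the list_lru to it, and only then publishes the shrinker, with each error label unwinding exactly what succeeded. A skeletal sketch of this prepare/attach/publish ordering (function names abbreviated):

#include <stdio.h>

static int prealloc(void)         { puts("reserve shrinker id"); return 0; }
static int lru_init(void)         { puts("init list_lru bound to id"); return 0; }
static void publish(void)         { puts("shrinker goes live"); }
static void free_prealloced(void) { puts("undo reservation"); }

static int workingset_init_sketch(void)
{
	int ret = prealloc();

	if (ret)
		goto err;
	ret = lru_init();
	if (ret)
		goto err_lru;
	publish();	/* only after everything it needs exists */
	return 0;
err_lru:
	free_prealloced();
err:
	return ret;
}

int main(void)
{
	return workingset_init_sketch();
}
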
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 8d87e973a4f5..9da65552e7ca 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -924,20 +924,7 @@ static void reset_page(struct page *page)
page->freelist = NULL;
}
-/*
- * To prevent zspage destroy during migration, zspage freeing should
- * hold locks of all pages in the zspage.
- */
-void lock_zspage(struct zspage *zspage)
-{
- struct page *page = get_first_page(zspage);
-
- do {
- lock_page(page);
- } while ((page = get_next_page(page)) != NULL);
-}
-
-int trylock_zspage(struct zspage *zspage)
+static int trylock_zspage(struct zspage *zspage)
{
struct page *cursor, *fail;
@@ -1814,6 +1801,19 @@ static enum fullness_group putback_zspage(struct size_class *class,
}
#ifdef CONFIG_COMPACTION
+/*
+ * To prevent zspage destroy during migration, zspage freeing should
+ * hold locks of all pages in the zspage.
+ */
+static void lock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ lock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+
static struct dentry *zs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -1905,7 +1905,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
__SetPageMovable(newpage, page_mapping(oldpage));
}
-bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1960,7 +1960,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode)
{
struct zs_pool *pool;
@@ -2076,7 +2076,7 @@ unpin_objects:
return ret;
}
-void zs_page_putback(struct page *page)
+static void zs_page_putback(struct page *page)
{
struct zs_pool *pool;
struct size_class *class;
@@ -2108,7 +2108,7 @@ void zs_page_putback(struct page *page)
spin_unlock(&class->lock);
}
-const struct address_space_operations zsmalloc_aops = {
+static const struct address_space_operations zsmalloc_aops = {
.isolate_page = zs_page_isolate,
.migratepage = zs_page_migrate,
.putback_page = zs_page_putback,
diff --git a/mm/zswap.c b/mm/zswap.c
index 7d34e69507e3..cd91fd9d96b8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
ret = -ENOMEM;
goto reject;
}
+
+ /*
+ * A second zswap_is_full() check after
+ * zswap_shrink() to make sure it's now
+ * under the max_pool_percent.
+ */
+ if (zswap_is_full()) {
+ ret = -ENOMEM;
+ goto reject;
+ }
}
/* allocate entry */
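
The zswap hunk above closes a check-then-act gap: zswap_shrink() is best-effort, so the pool limit is rechecked afterwards instead of assuming the shrink brought usage back under max_pool_percent. The bare shape of the pattern (a sketch, not zswap's code):

#include <stdbool.h>
#include <stdio.h>

static long pool_pages = 100, pool_limit = 90;

static bool pool_is_full(void) { return pool_pages >= pool_limit; }
static void shrink_pool(void)  { pool_pages -= 5; /* may not be enough */ }

static int store_page(void)
{
	if (pool_is_full()) {
		shrink_pool();
		/* Recheck: shrinking is best-effort, not a guarantee. */
		if (pool_is_full())
			return -1;	/* -ENOMEM in zswap */
	}
	pool_pages++;
	return 0;
}

int main(void)
{
	printf("store: %s\n", store_page() ? "rejected" : "ok");
	return 0;
}
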