author | James Morris <james.morris@microsoft.com> | 2018-09-04 11:35:54 -0700
committer | James Morris <james.morris@microsoft.com> | 2018-09-04 11:35:54 -0700
commit | e42f6f9be4f83c537aa81b4c6239ea94ff5b29ce (patch)
tree | f956a5ea0e83fc6d0df3e64681e7bbc1f201f3ee /mm
parent | 4408e300a67ab2ce2505087986a9fe922c800ffd (diff)
parent | 57361846b52bc686112da6ca5368d11210796804 (diff)
download | blackbird-obmc-linux-e42f6f9be4f83c537aa81b4c6239ea94ff5b29ce.tar.gz, blackbird-obmc-linux-e42f6f9be4f83c537aa81b4c6239ea94ff5b29ce.zip
Merge tag 'v4.19-rc2' into next-general
Sync to Linux 4.19-rc2 for downstream developers.
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 16
-rw-r--r-- | mm/Kconfig.debug | 6
-rw-r--r-- | mm/backing-dev.c | 15
-rw-r--r-- | mm/bootmem.c | 159
-rw-r--r-- | mm/cma.c | 8
-rw-r--r-- | mm/cma_debug.c | 2
-rw-r--r-- | mm/debug.c | 18
-rw-r--r-- | mm/fadvise.c | 8
-rw-r--r-- | mm/gup.c | 6
-rw-r--r-- | mm/hmm.c | 28
-rw-r--r-- | mm/huge_memory.c | 54
-rw-r--r-- | mm/hugetlb.c | 87
-rw-r--r-- | mm/init-mm.c | 11
-rw-r--r-- | mm/internal.h | 14
-rw-r--r-- | mm/kasan/kasan.c | 5
-rw-r--r-- | mm/kasan/kasan_init.c | 316
-rw-r--r-- | mm/khugepaged.c | 63
-rw-r--r-- | mm/ksm.c | 13
-rw-r--r-- | mm/list_lru.c | 147
-rw-r--r-- | mm/madvise.c | 16
-rw-r--r-- | mm/memblock.c | 255
-rw-r--r-- | mm/memcontrol.c | 599
-rw-r--r-- | mm/memfd.c | 2
-rw-r--r-- | mm/memory-failure.c | 256
-rw-r--r-- | mm/memory.c | 336
-rw-r--r-- | mm/memory_hotplug.c | 112
-rw-r--r-- | mm/mempolicy.c | 6
-rw-r--r-- | mm/mempool.c | 13
-rw-r--r-- | mm/migrate.c | 10
-rw-r--r-- | mm/mlock.c | 3
-rw-r--r-- | mm/mm_init.c | 9
-rw-r--r-- | mm/mmap.c | 80
-rw-r--r-- | mm/mmu_notifier.c | 19
-rw-r--r-- | mm/mprotect.c | 49
-rw-r--r-- | mm/nobootmem.c | 20
-rw-r--r-- | mm/nommu.c | 16
-rw-r--r-- | mm/oom_kill.c | 239
-rw-r--r-- | mm/page-writeback.c | 5
-rw-r--r-- | mm/page_alloc.c | 249
-rw-r--r-- | mm/page_ext.c | 4
-rw-r--r-- | mm/page_io.c | 3
-rw-r--r-- | mm/percpu.c | 29
-rw-r--r-- | mm/readahead.c | 19
-rw-r--r-- | mm/rmap.c | 8
-rw-r--r-- | mm/shmem.c | 69
-rw-r--r-- | mm/slab.h | 6
-rw-r--r-- | mm/slab_common.c | 12
-rw-r--r-- | mm/slub.c | 11
-rw-r--r-- | mm/sparse-vmemmap.c | 61
-rw-r--r-- | mm/sparse.c | 290
-rw-r--r-- | mm/swap_slots.c | 12
-rw-r--r-- | mm/swapfile.c | 270
-rw-r--r-- | mm/usercopy.c | 25
-rw-r--r-- | mm/util.c | 9
-rw-r--r-- | mm/vmacache.c | 38
-rw-r--r-- | mm/vmalloc.c | 4
-rw-r--r-- | mm/vmscan.c | 244
-rw-r--r-- | mm/vmstat.c | 2
-rw-r--r-- | mm/workingset.c | 30
-rw-r--r-- | mm/zsmalloc.c | 36
-rw-r--r-- | mm/zswap.c | 9
61 files changed, 2867 insertions, 1594 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ce95491abd6a..a550635ea5c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,3 +1,6 @@ + +menu "Memory Management options" + config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -115,10 +118,6 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool -config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - def_bool y - depends on SPARSEMEM && X86_64 - config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE @@ -420,10 +419,11 @@ config ARCH_WANTS_THP_SWAP config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP help Swap transparent huge pages in one piece, without splitting. - XXX: For now this only does clustered swap space allocation. + XXX: For now, swap cluster backing transparent huge page + will be split after swapout. For selection by architectures with reasonable THP sizes. @@ -635,7 +635,7 @@ config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n depends on NO_BOOTMEM - depends on !FLATMEM + depends on SPARSEMEM depends on !NEED_PER_CPU_KM help Ordinarily all struct pages are initialised during early boot in a @@ -762,3 +762,5 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool + +endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e5e606ee5f71..9a7b8b049d04 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -46,7 +46,8 @@ config PAGE_POISONING Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps reduce the risk of information leaks from freed data. This does - have a potential performance impact. + have a potential performance impact if enabled with the + "page_poison=1" kernel boot option. Note that "poison" here is not the same thing as the "HWPoison" for CONFIG_MEMORY_FAILURE. This is software poisoning only. @@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY say N. config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of random data" + bool "Use zero for poisoning instead of debugging value" depends on PAGE_POISONING ---help--- Instead of using the existing poison value, fill the pages with @@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO allocation. 
If unsure, say N - bool config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2e5d3df0853d..f5981e9d6ae2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -438,10 +438,10 @@ retry: if (new_congested) { /* !found and storage for new one already allocated, insert */ congested = new_congested; - new_congested = NULL; rb_link_node(&congested->rb_node, parent, node); rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - goto found; + spin_unlock_irqrestore(&cgwb_lock, flags); + return congested; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -451,13 +451,13 @@ retry: if (!new_congested) return NULL; - atomic_set(&new_congested->refcnt, 0); + refcount_set(&new_congested->refcnt, 1); new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; found: - atomic_inc(&congested->refcnt); + refcount_inc(&congested->refcnt); spin_unlock_irqrestore(&cgwb_lock, flags); kfree(new_congested); return congested; @@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested) { unsigned long flags; - local_irq_save(flags); - if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { - local_irq_restore(flags); + if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) return; - } /* bdi might already have been destroyed leaving @congested unlinked */ if (congested->__bdi) { @@ -804,7 +801,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; - atomic_set(&bdi->wb_congested->refcnt, 1); + refcount_set(&bdi->wb_congested->refcnt, 1); err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { diff --git a/mm/bootmem.c b/mm/bootmem.c index 9e197987b67d..97db0e8e362b 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -21,6 +21,53 @@ #include "internal.h" +/** + * DOC: bootmem overview + * + * Bootmem is a boot-time physical memory allocator and configurator. + * + * It is used early in the boot process before the page allocator is + * set up. + * + * Bootmem is based on the most basic of allocators, a First Fit + * allocator which uses a bitmap to represent memory. If a bit is 1, + * the page is allocated and 0 if unallocated. To satisfy allocations + * of sizes smaller than a page, the allocator records the Page Frame + * Number (PFN) of the last allocation and the offset the allocation + * ended at. Subsequent small allocations are merged together and + * stored on the same page. + * + * The information used by the bootmem allocator is represented by + * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES + * such structures is statically allocated and then it is discarded + * when the system initialization completes. Each entry in this array + * corresponds to a node with memory. For UMA systems only entry 0 is + * used. + * + * The bootmem allocator is initialized during early architecture + * specific setup. Each architecture is required to supply a + * :c:func:`setup_arch` function which, among other tasks, is + * responsible for acquiring the necessary parameters to initialise + * the boot memory allocator. These parameters define limits of usable + * physical memory: + * + * * @min_low_pfn - the lowest PFN that is available in the system + * * @max_low_pfn - the highest PFN that may be addressed by low + * memory (%ZONE_NORMAL) + * * @max_pfn - the last PFN available to the system. 
+ * + * After those limits are determined, the :c:func:`init_bootmem` or + * :c:func:`init_bootmem_node` function should be called to initialize + * the bootmem allocator. The UMA case should use the `init_bootmem` + * function. It will initialize ``contig_page_data`` structure that + * represents the only memory node in the system. In the NUMA case the + * `init_bootmem_node` function should be called to initialize the + * bootmem allocator for each node. + * + * Once the allocator is set up, it is possible to use either single + * node or NUMA variant of the allocation APIs. + */ + #ifndef CONFIG_NEED_MULTIPLE_NODES struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] @@ -62,6 +109,8 @@ static unsigned long __init bootmap_bytes(unsigned long pages) /** * bootmem_bootmap_pages - calculate bitmap size in pages * @pages: number of pages the bitmap has to represent + * + * Return: the number of pages needed to hold the bitmap. */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { @@ -121,7 +170,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, * @startpfn: first pfn on the node * @endpfn: first pfn after the node * - * Returns the number of bytes needed to hold the bitmap for this node. + * Return: the number of bytes needed to hold the bitmap for this node. */ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) @@ -134,7 +183,7 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, * @start: pfn where the bitmap is to be placed * @pages: number of available physical pages * - * Returns the number of bytes needed to hold the bitmap. + * Return: the number of bytes needed to hold the bitmap. */ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) { @@ -143,15 +192,6 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -/* - * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting physical address of the range - * @size: size of the range in bytes - * - * This is only useful when the bootmem allocator has already been torn - * down, but we are still initializing the system. Pages are given directly - * to the page allocator, no bootmem metadata is updated because it is gone. - */ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; @@ -264,11 +304,6 @@ void __init reset_all_zones_managed_pages(void) reset_managed_pages_done = 1; } -/** - * free_all_bootmem - release free pages to the buddy allocator - * - * Returns the number of pages actually released. - */ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; @@ -385,16 +420,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, BUG(); } -/** - * free_bootmem_node - mark a page range as usable - * @pgdat: node the range resides on - * @physaddr: starting address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must reside completely on the specified node. 
- */ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { @@ -408,15 +433,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, mark_bootmem_node(pgdat->bdata, start, end, 0, 0); } -/** - * free_bootmem - mark a page range as usable - * @physaddr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must be contiguous but may span node boundaries. - */ void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; @@ -439,6 +455,8 @@ void __init free_bootmem(unsigned long physaddr, unsigned long size) * Partial pages will be reserved. * * The range must reside completely on the specified node. + * + * Return: 0 on success, -errno on failure. */ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) @@ -460,6 +478,8 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, * Partial pages will be reserved. * * The range must be contiguous but may span node boundaries. + * + * Return: 0 on success, -errno on failure. */ int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) @@ -646,19 +666,6 @@ restart: return NULL; } -/** - * __alloc_bootmem_nopanic - allocate boot memory without panicking - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * Returns NULL on failure. - */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { @@ -682,19 +689,6 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, return NULL; } -/** - * __alloc_bootmem - allocate boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { @@ -754,21 +748,6 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, return NULL; } -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. 
- */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { @@ -807,19 +786,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -/** - * __alloc_bootmem_low - allocate low boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { @@ -834,21 +800,6 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size, ARCH_LOW_ADDRESS_LIMIT); } -/** - * __alloc_bootmem_low_node - allocate low boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { @@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP mask to use during compaction + * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask) + bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; @@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - gfp_mask); + GFP_KERNEL | (no_warn ? 
__GFP_NOWARN : 0)); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); - if (ret && !(gfp_mask & __GFP_NOWARN)) { + if (ret && !no_warn) { pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f23467291cfb..ad6723e9d110 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0, GFP_KERNEL); + p = cma_alloc(cma, count, 0, false); if (!p) { kfree(mem); return -ENOMEM; diff --git a/mm/debug.c b/mm/debug.c index 56e2d9125ea5..38c926520c97 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -43,12 +43,25 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + bool page_poisoned = PagePoisoned(page); + int mapcount; + + /* + * If struct page is poisoned don't access Page*() functions as that + * leads to recursive loop. Page*() check for poisoned pages, and calls + * dump_page() when detected. + */ + if (page_poisoned) { + pr_emerg("page:%px is uninitialized and poisoned", page); + goto hex_only; + } + /* * Avoid VM_BUG_ON() in page_mapcount(). * page->_mapcount space in struct page is used by sl[aou]b pages to * encode own info. */ - int mapcount = PageSlab(page) ? 0 : page_mapcount(page); + mapcount = PageSlab(page) ? 0 : page_mapcount(page); pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, @@ -60,6 +73,7 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); +hex_only: print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); @@ -68,7 +82,7 @@ void __dump_page(struct page *page, const char *reason) pr_alert("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (page->mem_cgroup) + if (!page_poisoned && page->mem_cgroup) pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } diff --git a/mm/fadvise.c b/mm/fadvise.c index afa41491d324..2d8376e3c640 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -72,8 +72,12 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - /* Careful about overflows. Len == 0 means "as much as possible" */ - endbyte = offset + len; + /* + * Careful about overflows. Len == 0 means "as much as possible". Use + * unsigned math because signed overflows are undefined and UBSan + * complains. 
+ */ + endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) endbyte = -1; else @@ -497,7 +497,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { unsigned int fault_flags = 0; - int ret; + vm_fault_t ret; /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) @@ -818,7 +818,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - int ret, major = 0; + vm_fault_t ret, major = 0; if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; @@ -1238,8 +1238,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) int locked = 0; long ret = 0; - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; for (nstart = start; nstart < end; nstart = nend) { @@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) up_write(&hmm->mirrors_sem); } -static void hmm_invalidate_range_start(struct mmu_notifier *mn, +static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); atomic_inc(&hmm->sequence); + + return 0; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, @@ -299,14 +302,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - int r; + vm_fault_t ret; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; - r = handle_mm_fault(vma, addr, flags); - if (r & VM_FAULT_RETRY) + ret = handle_mm_fault(vma, addr, flags); + if (ret & VM_FAULT_RETRY) return -EBUSY; - if (r & VM_FAULT_ERROR) { + if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } @@ -676,7 +679,8 @@ int hmm_vma_get_pfns(struct hmm_range *range) return -EINVAL; /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } @@ -849,7 +853,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block) return -EINVAL; /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } @@ -963,6 +968,8 @@ static void hmm_devmem_free(struct page *page, void *data) { struct hmm_devmem *devmem = data; + page->mapping = NULL; + devmem->ops->free(devmem, page); } @@ -971,10 +978,7 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size; - - align_start = resource->start & ~(PA_SECTION_SIZE - 1); - align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + resource_size_t key; mutex_lock(&hmm_devmem_lock); for (key = resource->start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cd7c1a57a14..c3bc7e9c9a2a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -541,18 +541,18 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, - gfp_t gfp) +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, + struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int ret = 0; + vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -584,15 +584,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - int ret; + vm_fault_t ret2; spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + ret2 = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); + return ret2; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -663,7 +663,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct vm_fault *vmf) +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; @@ -682,7 +682,7 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) pgtable_t pgtable; struct page *zero_page; bool set; - int ret; + vm_fault_t ret; pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -752,7 +752,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
unsigned long addr, spin_unlock(ptl); } -int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; @@ -762,11 +762,11 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + !pfn_t_devmap(pfn)); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; @@ -812,7 +812,7 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, spin_unlock(ptl); } -int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; @@ -1118,15 +1118,16 @@ unlock: spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, - struct page *page) +static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, + pmd_t orig_pmd, struct page *page) { struct vm_area_struct *vma = vmf->vma; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; - int ret = 0, i; + int i; + vm_fault_t ret = 0; struct page **pages; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1142,7 +1143,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], vma->vm_mm, + mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); @@ -1236,7 +1237,7 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; @@ -1245,7 +1246,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ - int ret = 0; + vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1312,7 +1313,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); @@ -1328,7 +1329,8 @@ alloc: if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + copy_user_huge_page(new_page, page, vmf->address, + vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); mmun_start = haddr; @@ -1456,7 +1458,7 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { struct vm_area_struct *vma = 
vmf->vma; struct anon_vma *anon_vma = NULL; @@ -1740,7 +1742,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -2084,11 +2086,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (vma_is_dax(vma)) return; page = pmd_page(_pmd); + if (!PageDirty(page) && pmd_dirty(_pmd)) + set_page_dirty(page); if (!PageReferenced(page) && pmd_young(_pmd)) SetPageReferenced(page); page_remove_rmap(page, true); put_page(page); - add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3612fbb32e9d..3c21775f196b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1479,22 +1479,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, /* * Dissolve a given free hugepage into free buddy pages. This function does * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the - * number of free hugepages would be reduced below the number of reserved - * hugepages. + * dissolution fails because a give page is not a free hugepage, or because + * free hugepages are fully reserved. */ int dissolve_free_huge_page(struct page *page) { - int rc = 0; + int rc = -EBUSY; spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); int nid = page_to_nid(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) { - rc = -EBUSY; + if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; - } /* * Move PageHWPoison flag from head page to the raw error page, * which makes any subpages rather than the error page reusable. 
@@ -1508,6 +1506,7 @@ int dissolve_free_huge_page(struct page *page) h->free_huge_pages_node[nid]--; h->max_huge_pages--; update_and_free_page(h, head); + rc = 0; } out: spin_unlock(&hugetlb_lock); @@ -2101,7 +2100,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_nopanic( + addr = memblock_virt_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { @@ -2119,6 +2118,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) found: BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ + INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); m->hstate = h; return 1; @@ -2139,16 +2139,9 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); struct hstate *h = m->hstate; - struct page *page; -#ifdef CONFIG_HIGHMEM - page = pfn_to_page(m->phys >> PAGE_SHIFT); - memblock_free_late(__pa(m), - sizeof(struct huge_bootmem_page)); -#else - page = virt_to_page(m); -#endif WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); @@ -2163,6 +2156,7 @@ static void __init gather_bootmem_prealloc(void) */ if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); + cond_resched(); } } @@ -3166,6 +3160,13 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +/* + * When a new function is introduced to vm_operations_struct and added + * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. + * This is because under System V memory model, mappings created via + * shmget/shmat with "huge page" specified are backed by hugetlbfs files, + * their original vm_ops are overwritten with shm_vm_ops. + */ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, @@ -3500,16 +3501,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, +static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, struct page *pagecache_page, spinlock_t *ptl) { pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int ret = 0, outside_reserve = 0; + int outside_reserve = 0; + vm_fault_t ret = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + unsigned long haddr = address & huge_page_mask(h); pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ -3519,7 +3522,7 @@ retry_avoidcopy: * and just make the page writable */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { page_move_anon_rmap(old_page, vma); - set_huge_ptep_writable(vma, address, ptep); + set_huge_ptep_writable(vma, haddr, ptep); return 0; } @@ -3543,7 +3546,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. 
*/ spin_unlock(ptl); - new_page = alloc_huge_page(vma, address, outside_reserve); + new_page = alloc_huge_page(vma, haddr, outside_reserve); if (IS_ERR(new_page)) { /* @@ -3556,11 +3559,10 @@ retry_avoidcopy: if (outside_reserve) { put_page(old_page); BUG_ON(huge_pte_none(pte)); - unmap_ref_private(mm, vma, old_page, address); + unmap_ref_private(mm, vma, old_page, haddr); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h), - huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3571,8 +3573,7 @@ retry_avoidcopy: return 0; } - ret = (PTR_ERR(new_page) == -ENOMEM) ? - VM_FAULT_OOM : VM_FAULT_SIGBUS; + ret = vmf_error(PTR_ERR(new_page)); goto out_release_old; } @@ -3590,7 +3591,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); set_page_huge_active(new_page); - mmun_start = address & huge_page_mask(h); + mmun_start = haddr; mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); @@ -3599,25 +3600,24 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h), - huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ - huge_ptep_clear_flush(vma, address, ptep); + huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); - set_huge_pte_at(mm, address, ptep, + set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); - hugepage_add_new_anon_rmap(new_page, vma, address); + hugepage_add_new_anon_rmap(new_page, vma, haddr); /* Make the old page be freed below */ new_page = old_page; } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: - restore_reserve_on_error(h, vma, address, new_page); + restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -3676,12 +3676,13 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } -static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, unsigned int flags) +static vm_fault_t hugetlb_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); - int ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct page *page; @@ -3744,11 +3745,7 @@ retry: page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { - ret = PTR_ERR(page); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -3820,7 +3817,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3872,12 +3869,12 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, } #endif -int hugetlb_fault(struct mm_struct *mm, 
struct vm_area_struct *vma, +vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pte_t *ptep, entry; spinlock_t *ptl; - int ret; + vm_fault_t ret; u32 hash; pgoff_t idx; struct page *page = NULL; @@ -3974,7 +3971,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, haddr, ptep, + ret = hugetlb_cow(mm, vma, address, ptep, pagecache_page, ptl); goto out_put_page; } @@ -4207,7 +4204,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { - int ret; + vm_fault_t ret; unsigned int fault_flags = 0; if (pte) diff --git a/mm/init-mm.c b/mm/init-mm.c index f0179c9c04c2..a787a319211e 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -15,6 +15,16 @@ #define INIT_MM_CONTEXT(name) #endif +/* + * For dynamically allocated mm_structs, there is a dynamically sized cpumask + * at the end of the structure, the size of which depends on the maximum CPU + * number the system can see. That way we allocate only as much memory for + * mm_cpumask() as needed for the hundreds, or thousands of processes that + * a system typically runs. + * + * Since there is only one init_mm in the entire system, keep it simple + * and size this cpu_bitmask to NR_CPUS. + */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, .pgd = swapper_pg_dir, @@ -25,5 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, + .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index 9e3654d70289..87256ae1bef8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,7 +38,7 @@ void page_writeback_init(void); -int do_swap_page(struct vm_fault *vmf); +vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter, return iter + 1; } -/* - * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, - * so all functions starting at paging_init should be marked __init - * in those cases. SPARSEMEM, however, allows for memory hotplug, - * and alloc_bootmem_node is not used. 
- */ -#ifdef CONFIG_SPARSEMEM -#define __paginginit __meminit -#else -#define __paginginit __init -#endif - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index f185455b3406..c3bd5209da38 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -619,12 +619,13 @@ void kasan_kfree_large(void *ptr, unsigned long ip) int kasan_module_alloc(void *addr, size_t size) { void *ret; + size_t scaled_size; size_t shadow_size; unsigned long shadow_start; shadow_start = (unsigned long)kasan_mem_to_shadow(addr); - shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, - PAGE_SIZE); + scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; + shadow_size = round_up(scaled_size, PAGE_SIZE); if (WARN_ON(!PAGE_ALIGNED(shadow_start))) return -EINVAL; diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index f436246ccc79..7a2a2f13f86f 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -17,10 +17,13 @@ #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/pgalloc.h> +#include "kasan.h" + /* * This page serves two purposes: * - It used as early shadow memory. The entire shadow region populated @@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 2 pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return 0; +} #endif pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); +} + +static inline bool kasan_zero_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); +} + static __init void *early_alloc(size_t size, int node) { return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, addr); @@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, } } -static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd = pmd_offset(pud, addr); @@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, } if (pmd_none(*pmd)) { - pmd_populate_kernel(&init_mm, pmd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm, addr); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + 
pmd_populate_kernel(&init_mm, pmd, p); } zero_pte_populate(pmd, addr, next); } while (pmd++, addr = next, addr != end); + + return 0; } -static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud = pud_offset(p4d, addr); @@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, } if (pud_none(*pud)) { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pmd_populate(pud, addr, next); } while (pud++, addr = next, addr != end); + + return 0; } -static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, } if (p4d_none(*p4d)) { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pud_populate(p4d, addr, next); } while (p4d++, addr = next, addr != end); + + return 0; } /** @@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, * @shadow_start - start of the memory range to populate * @shadow_end - end of the memory range to populate */ -void __init kasan_populate_zero_shadow(const void *shadow_start, +int __ref kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end) { unsigned long addr = (unsigned long)shadow_start; @@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - pgd_populate(&init_mm, pgd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + 
unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_zero_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) + pmd_clear(pmd); + continue; + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) + pud_clear(pud); + continue; + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) + p4d_clear(p4d); + continue; + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) + pgd_clear(pgd); + continue; + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return -EINVAL; + + ret = kasan_populate_zero_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(shadow_start, + size >> KASAN_SHADOW_SCALE_SHIFT); + return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7b2a4bf8671..a31d740e6cd1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -397,6 +397,26 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if ((!(vm_flags & 
VM_HUGEPAGE) && !khugepaged_always()) || + (vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + if (shmem_file(vma->vm_file)) { + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return false; + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + return !(vm_flags & VM_NO_KHUGEPAGED); +} + int __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; @@ -434,15 +454,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { unsigned long hstart, hend; - if (!vma->anon_vma) - /* - * Not yet faulted in so we will register later in the - * page fault if needed. - */ - return 0; - if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) - /* khugepaged not yet working on file or special mappings */ + + /* + * khugepaged does not yet work on non-shmem files or special + * mappings. And file-private shmem THP is not supported. + */ + if (!hugepage_vma_check(vma, vm_flags)) return 0; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -819,25 +838,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) } #endif -static bool hugepage_vma_check(struct vm_area_struct *vma) -{ - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_file(vma->vm_file)) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); - } - if (!vma->anon_vma || vma->vm_ops) - return false; - if (is_vma_temporary_stack(vma)) - return false; - return !(vma->vm_flags & VM_NO_KHUGEPAGED); -} - /* * If mmap_sem temporarily dropped, revalidate vma * before taking mmap_sem. @@ -862,7 +862,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma)) + if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; return 0; } @@ -880,7 +880,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - int swapped_in = 0, ret = 0; + int swapped_in = 0; + vm_fault_t ret = 0; struct vm_fault vmf = { .vma = vma, .address = address, @@ -1517,6 +1518,8 @@ tree_unlocked: unlock_page(new_page); *hpage = NULL; + + khugepaged_pages_collapsed++; } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); @@ -1694,7 +1697,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma)) { + if (!hugepage_vma_check(vma, vma->vm_flags)) { skip: progress++; continue; @@ -470,7 +470,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { struct page *page; - int ret = 0; + vm_fault_t ret = 0; do { cond_resched(); @@ -652,9 +652,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. 
This will verify that future list.h changes - * don't break STABLE_NODE_DUP_HEAD. + * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ -#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ +#if defined(GCC_VERSION) && GCC_VERSION >= 40903 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); #endif @@ -703,7 +703,7 @@ again: * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; - * however, it might mean that the page is under page_freeze_refs(). + * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * but if page is swapcache in migrate_page_move_mapping(), it might * still be our page, in which case it's essential to keep the node. @@ -714,7 +714,7 @@ again: * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) - * in the freeze_refs section of __remove_mapping(); but Anon + * in the ref_freeze section of __remove_mapping(); but Anon * page->mapping reset to NULL later, in free_pages_prepare(). */ if (!PageSwapCache(page)) @@ -2430,6 +2430,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ + if (vma_is_dax(vma)) + return 0; + #ifdef VM_SAO if (*vm_flags & VM_SAO) return 0; diff --git a/mm/list_lru.c b/mm/list_lru.c index fcfb6c89ed47..5b30625fd365 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/memcontrol.h> -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -29,17 +29,12 @@ static void list_lru_unregister(struct list_lru *lru) list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } -#else -static void list_lru_register(struct list_lru *lru) -{ -} -static void list_lru_unregister(struct list_lru *lru) +static int lru_shrinker_id(struct list_lru *lru) { + return lru->shrinker_id; } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -75,20 +70,39 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, + struct mem_cgroup **memcg_ptr) { - struct mem_cgroup *memcg; + struct list_lru_one *l = &nlru->lru; + struct mem_cgroup *memcg = NULL; if (!nlru->memcg_lrus) - return &nlru->lru; + goto out; memcg = mem_cgroup_from_kmem(ptr); if (!memcg) - return &nlru->lru; + goto out; - return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); + l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +out: + if (memcg_ptr) + *memcg_ptr = memcg; + return l; } #else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} + +static int lru_shrinker_id(struct list_lru *lru) +{ + return -1; +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; @@ -101,23 +115,30 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) } static inline struct list_lru_one * 
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, + struct mem_cgroup **memcg_ptr) { + if (memcg_ptr) + *memcg_ptr = NULL; return &nlru->lru; } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(nlru, item); + l = list_lru_from_kmem(nlru, item, &memcg); list_add_tail(item, &l->list); - l->nr_items++; + /* Set shrinker bit if the first element was added */ + if (!l->nr_items++) + memcg_set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -135,7 +156,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(nlru, item); + l = list_lru_from_kmem(nlru, item, NULL); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -162,26 +183,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -static unsigned long __list_lru_count_one(struct list_lru *lru, - int nid, int memcg_idx) +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; unsigned long count; rcu_read_lock(); - l = list_lru_from_memcg_idx(nlru, memcg_idx); + l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); count = l->nr_items; rcu_read_unlock(); return count; } - -unsigned long list_lru_count_one(struct list_lru *lru, - int nid, struct mem_cgroup *memcg) -{ - return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); -} EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) @@ -194,17 +209,15 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, +__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; - spin_lock(&nlru->lock); l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: list_for_each_safe(item, n, &l->list) { @@ -250,8 +263,6 @@ restart: BUG(); } } - - spin_unlock(&nlru->lock); return isolated; } @@ -260,11 +271,32 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), - isolate, cb_arg, nr_to_walk); + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock(&nlru->lock); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, + nr_to_walk); + spin_unlock(&nlru->lock); + return ret; } EXPORT_SYMBOL_GPL(list_lru_walk_one); +unsigned long +list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock_irq(&nlru->lock); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), 
isolate, cb_arg, + nr_to_walk); + spin_unlock_irq(&nlru->lock); + return ret; +} + unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) @@ -272,12 +304,18 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, long isolated = 0; int memcg_idx; - isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, - nr_to_walk); + isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, + nr_to_walk); if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { for_each_memcg_cache_index(memcg_idx) { - isolated += __list_lru_walk_one(lru, nid, memcg_idx, - isolate, cb_arg, nr_to_walk); + struct list_lru_node *nlru = &lru->node[nid]; + + spin_lock(&nlru->lock); + isolated += __list_lru_walk_one(nlru, memcg_idx, + isolate, cb_arg, + nr_to_walk); + spin_unlock(&nlru->lock); + if (*nr_to_walk <= 0) break; } @@ -292,7 +330,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -500,10 +538,13 @@ fail: goto out; } -static void memcg_drain_list_lru_node(struct list_lru_node *nlru, - int src_idx, int dst_idx) +static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, + int src_idx, struct mem_cgroup *dst_memcg) { + struct list_lru_node *nlru = &lru->node[nid]; + int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; + bool set; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, @@ -515,14 +556,17 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru, dst = list_lru_from_memcg_idx(nlru, dst_idx); list_splice_init(&src->list, &dst->list); + set = (!dst->nr_items && src->nr_items); dst->nr_items += src->nr_items; + if (set) + memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; spin_unlock_irq(&nlru->lock); } static void memcg_drain_list_lru(struct list_lru *lru, - int src_idx, int dst_idx) + int src_idx, struct mem_cgroup *dst_memcg) { int i; @@ -530,16 +574,16 @@ static void memcg_drain_list_lru(struct list_lru *lru, return; for_each_node(i) - memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); + memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); } -void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru *lru; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &list_lrus, list) - memcg_drain_list_lru(lru, src_idx, dst_idx); + memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } #else @@ -551,15 +595,21 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key) + struct lock_class_key *key, struct shrinker *shrinker) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + if (shrinker) + lru->shrinker_id = shrinker->id; + else + lru->shrinker_id = -1; +#endif memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); @@ -602,6 +652,9 @@ void list_lru_destroy(struct list_lru *lru) kfree(lru->node); lru->node = NULL; +#ifdef CONFIG_MEMCG_KMEM + lru->shrinker_id = -1; +#endif memcg_put_cache_ids(); } 
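[Editor's note: the list_lru changes above raise the per-memcg shrinker bit only on the empty-to-non-empty transition, i.e. when !l->nr_items++ evaluates true, so the bitmap update is skipped for every later insertion. A stripped-down, runnable sketch of that idiom follows; set_shrinker_bit() here is a stand-in for the kernel helper, not the real thing.]

/* Illustrative only: the "flag it on the 0 -> 1 transition" idiom used by
 * list_lru_add() and memcg_drain_list_lru_node() above. */
#include <stdio.h>
#include <stdbool.h>

struct lru_one {
    unsigned long nr_items;
};

static bool shrinker_bit;   /* stand-in for the per-memcg shrinker bitmap */

static void set_shrinker_bit(void)
{
    shrinker_bit = true;
    printf("shrinker bit set\n");
}

static void lru_add(struct lru_one *l)
{
    /* Post-increment: the test sees the count *before* the add, so the
     * bit is only touched when the list was empty. */
    if (!l->nr_items++)
        set_shrinker_bit();
}

int main(void)
{
    struct lru_one l = { 0 };

    lru_add(&l);    /* first add: prints "shrinker bit set" */
    lru_add(&l);    /* later adds: bitmap untouched */
    lru_add(&l);
    printf("nr_items=%lu, bit=%d\n", l.nr_items, shrinker_bit);
    return 0;
}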
EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/madvise.c b/mm/madvise.c index 4d3c922ea1a1..972a9eaa898b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -631,11 +631,13 @@ static int madvise_inject_error(int behavior, for (; start < end; start += PAGE_SIZE << order) { + unsigned long pfn; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; + pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page @@ -651,17 +653,25 @@ static int madvise_inject_error(int behavior, if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", - page_to_pfn(page), start); + pfn, start); ret = soft_offline_page(page, MF_COUNT_INCREASED); if (ret) return ret; continue; } + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", - page_to_pfn(page), start); + pfn, start); - ret = memory_failure(page_to_pfn(page), MF_COUNT_INCREASED); + /* + * Drop the page reference taken by get_user_pages_fast(). In + * the absence of MF_COUNT_INCREASED the memory_failure() + * routine is responsible for pinning the page to prevent it + * from being released back to the page allocator. + */ + put_page(page); + ret = memory_failure(pfn, 0); if (ret) return ret; } diff --git a/mm/memblock.c b/mm/memblock.c index 03d48d8835ba..237944479d25 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,12 +20,68 @@ #include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/sections.h> #include <linux/io.h> #include "internal.h" +/** + * DOC: memblock overview + * + * Memblock is a method of managing memory regions during the early + * boot period when the usual kernel memory allocators are not up and + * running. + * + * Memblock views the system memory as collections of contiguous + * regions. There are several types of these collections: + * + * * ``memory`` - describes the physical memory available to the + * kernel; this may differ from the actual physical memory installed + * in the system, for instance when the memory is restricted with + * ``mem=`` command line parameter + * * ``reserved`` - describes the regions that were allocated + * * ``physmap`` - describes the actual physical memory regardless of + * the possible restrictions; the ``physmap`` type is only available + * on some architectures. + * + * Each region is represented by :c:type:`struct memblock_region` that + * defines the region extents, its attributes and NUMA node id on NUMA + * systems. Every memory type is described by the :c:type:`struct + * memblock_type` which contains an array of memory regions along with + * the allocator metadata. The memory types are nicely wrapped with + * :c:type:`struct memblock`. This structure is statically initialzed + * at build time. The region arrays for the "memory" and "reserved" + * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the + * "physmap" type to %INIT_PHYSMEM_REGIONS. + * The :c:func:`memblock_allow_resize` enables automatic resizing of + * the region arrays during addition of new regions. This feature + * should be used with care so that memory allocated for the region + * array will not overlap with areas that should be reserved, for + * example initrd. + * + * The early architecture setup should tell memblock what the physical + * memory layout is by using :c:func:`memblock_add` or + * :c:func:`memblock_add_node` functions. 
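[Editor's note: the "memblock overview" DOC comment being added here describes early-boot memory as a pair of region arrays, "memory" (what exists) and "reserved" (what was already handed out). Below is a deliberately simplified userspace model of that bookkeeping, with a made-up region limit and none of memblock's merging, resizing, flags or NUMA handling, purely to make the add/reserve split concrete.]

/* Illustrative only: a toy model of memblock's "memory" vs "reserved"
 * region arrays. Real memblock merges neighbours, can grow the arrays and
 * tracks per-region flags and NUMA nodes; none of that is done here. */
#include <stdio.h>

#define MAX_REGIONS 8   /* assumed limit, stands in for INIT_MEMBLOCK_REGIONS */

struct region {
    unsigned long long base;
    unsigned long long size;
};

struct region_type {
    const char *name;
    int cnt;
    struct region regions[MAX_REGIONS];
};

static struct region_type memory   = { .name = "memory"   };
static struct region_type reserved = { .name = "reserved" };

static int region_add(struct region_type *type, unsigned long long base,
                      unsigned long long size)
{
    if (type->cnt == MAX_REGIONS)
        return -1;
    type->regions[type->cnt].base = base;
    type->regions[type->cnt].size = size;
    type->cnt++;
    return 0;
}

static void region_dump(const struct region_type *type)
{
    int i;

    for (i = 0; i < type->cnt; i++)
        printf("  %s[%d] [0x%llx-0x%llx]\n", type->name, i,
               type->regions[i].base,
               type->regions[i].base + type->regions[i].size - 1);
}

int main(void)
{
    /* "memory": what the platform reported (cf. memblock_add()). */
    region_add(&memory, 0x80000000ULL, 0x40000000ULL);      /* 1 GiB of RAM */
    /* "reserved": what early boot already handed out (cf. memblock_reserve()). */
    region_add(&reserved, 0x80200000ULL, 0x01000000ULL);    /* e.g. kernel image */
    region_add(&reserved, 0x88000000ULL, 0x00400000ULL);    /* e.g. initrd */

    region_dump(&memory);
    region_dump(&reserved);
    return 0;
}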
The first function does not + * assign the region to a NUMA node and it is appropriate for UMA + * systems. Yet, it is possible to use it on NUMA systems as well and + * assign the region to a NUMA node later in the setup process using + * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` + * performs such an assignment directly. + * + * Once memblock is setup the memory can be allocated using either + * memblock or bootmem APIs. + * + * As the system boot progresses, the architecture specific + * :c:func:`mem_init` function frees all the memory to the buddy page + * allocator. + * + * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * memblock data structures will be discarded after the system + * initialization compltes. + */ + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -60,7 +116,7 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; -ulong __init_memblock choose_memblock_flags(void) +enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -92,10 +148,11 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, return i < type->cnt; } -/* +/** * __memblock_find_range_bottom_up - find free area utility in bottom-up * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -103,13 +160,13 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, - ulong flags) + enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; @@ -129,7 +186,8 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, /** * __memblock_find_range_top_down - find free area utility, in top-down * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -137,13 +195,13 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * * Utility called from memblock_find_in_range_node(), find free area top-down. * - * RETURNS: + * Return: * Found address on success, 0 on failure. 
*/ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, - ulong flags) + enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; @@ -169,7 +227,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @size: size of free area to find * @align: alignment of free area to find * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes * @@ -183,12 +242,13 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * If bottom-up allocation failed, will try to allocate memory top-down. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, ulong flags) + phys_addr_t end, int nid, + enum memblock_flags flags) { phys_addr_t kernel_end, ret; @@ -227,7 +287,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); + WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE), + "memblock: bottom-up allocation failed, memory hotremove may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, @@ -237,13 +298,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /** * memblock_find_in_range - find free area in given range * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * * Find @size free area aligned to @align in the specified range. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, @@ -251,7 +313,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { phys_addr_t ret; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); again: ret = memblock_find_in_range_node(size, align, start, end, @@ -287,7 +349,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK /** - * Discard memory and reserved arrays if they were allocated + * memblock_discard - discard memory and reserved arrays if they were allocated */ void __init memblock_discard(void) { @@ -317,11 +379,11 @@ void __init memblock_discard(void) * * Double the size of the @type regions array. If memblock is being used to * allocate memory for a new reserved regions array and there is a previously - * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * allocated memory range [@new_area_start, @new_area_start + @new_area_size] * waiting to be reserved, ensure the memory used by the new array does * not overlap. * - * RETURNS: + * Return: * 0 on success, -1 on failure. 
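[Editor's note: per free range, __memblock_find_range_bottom_up()/_top_down() above reduce to clamping the range into [start, end) and aligning the candidate address. A self-contained sketch of the top-down variant of that arithmetic; clamp() and round_down_to() are local reimplementations for the demo, not the kernel macros.]

/* Illustrative only: the clamp-and-align candidate computation used by the
 * memblock free-area search, for a single free range, top-down. */
#include <stdio.h>

typedef unsigned long long phys_addr_t;

static phys_addr_t clamp(phys_addr_t v, phys_addr_t lo, phys_addr_t hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

static phys_addr_t round_down_to(phys_addr_t v, phys_addr_t align)
{
    return v & ~(align - 1);    /* align must be a power of two */
}

/* Returns an aligned candidate inside [start, end) that fits @size within
 * the free range [free_start, free_end), or 0 if it does not fit. */
static phys_addr_t find_top_down(phys_addr_t free_start, phys_addr_t free_end,
                                 phys_addr_t start, phys_addr_t end,
                                 phys_addr_t size, phys_addr_t align)
{
    phys_addr_t this_start = clamp(free_start, start, end);
    phys_addr_t this_end   = clamp(free_end, start, end);
    phys_addr_t cand;

    if (this_end < size)
        return 0;
    cand = round_down_to(this_end - size, align);
    return cand >= this_start ? cand : 0;
}

int main(void)
{
    /* 16 MiB free at 0x80000000; want 1 MiB, 2 MiB aligned, capped at 0x80800000 */
    phys_addr_t got = find_top_down(0x80000000ULL, 0x81000000ULL,
                                    0x80000000ULL, 0x80800000ULL,
                                    0x100000ULL, 0x200000ULL);
    printf("candidate: 0x%llx\n", got);    /* prints 0x80600000 */
    return 0;
}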
*/ static int __init_memblock memblock_double_array(struct memblock_type *type, @@ -330,7 +392,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, { struct memblock_region *new_array, *old_array; phys_addr_t old_alloc_size, new_alloc_size; - phys_addr_t old_size, new_size, addr; + phys_addr_t old_size, new_size, addr, new_end; int use_slab = slab_is_available(); int *in_slab; @@ -391,9 +453,9 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, return -1; } - memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - type->name, type->max * 2, (u64)addr, - (u64)addr + new_size - 1); + new_end = addr + new_size - 1; + memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]", + type->name, type->max * 2, &addr, &new_end); /* * Found space, we now need to move the array over before we add the @@ -466,13 +528,14 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @nid: node id of the new region * @flags: flags of the new region * - * Insert new memblock region [@base,@base+@size) into @type at @idx. + * Insert new memblock region [@base, @base + @size) into @type at @idx. * @type must already have extra room to accommodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, phys_addr_t size, - int nid, unsigned long flags) + int nid, + enum memblock_flags flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -494,17 +557,17 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @nid: nid of the new region * @flags: flags of the new region * - * Add new memblock region [@base,@base+@size) into @type. The new region + * Add new memblock region [@base, @base + @size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already * existing regions. @type is guaranteed to be minimal (all neighbouring * compatible regions are merged) after the addition. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, - int nid, unsigned long flags) + int nid, enum memblock_flags flags) { bool insert = false; phys_addr_t obase = base; @@ -588,12 +651,35 @@ repeat: } } +/** + * memblock_add_node - add new memblock region within a NUMA node + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for mode details + * + * Return: + * 0 on success, -errno on failure. + */ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { return memblock_add_range(&memblock.memory, base, size, nid, 0); } +/** + * memblock_add - add new memblock region + * @base: base address of the new region + * @size: size of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for mode details + * + * Return: + * 0 on success, -errno on failure. + */ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; @@ -613,11 +699,11 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) * @end_rgn: out parameter for the end of isolated region * * Walk @type and ensure that regions don't cross the boundaries defined by - * [@base,@base+@size). 
Crossing regions are split at the boundaries, + * [@base, @base + @size). Crossing regions are split at the boundaries, * which may create at most two more regions. The index of the first * region inside the range is returned in *@start_rgn and end in *@end_rgn. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ static int __init_memblock memblock_isolate_range(struct memblock_type *type, @@ -728,10 +814,15 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) } /** + * memblock_setclr_flag - set or clear flag for a memory region + * @base: base address of the region + * @size: size of the region + * @set: set or clear the flag + * @flag: the flag to udpate * * This function isolates region [@base, @base + @size), and sets/clears flag * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ static int __init_memblock memblock_setclr_flag(phys_addr_t base, phys_addr_t size, int set, int flag) @@ -758,7 +849,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { @@ -770,7 +861,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { @@ -782,7 +873,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) { @@ -796,7 +887,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { @@ -808,7 +899,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { @@ -873,7 +964,8 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, +void __init_memblock __next_mem_range(u64 *idx, int nid, + enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -968,9 +1060,6 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, /** * __next_mem_range_rev - generic next function for for_each_*_range_rev() * - * Finds the next range from type_a which is not marked as unsuitable - * in type_b. 
- * * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes @@ -980,9 +1069,13 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. + * * Reverse of __next_mem_range(). */ -void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -1114,10 +1207,10 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base, @base + @size) to @nid. * Regions which cross the area boundaries are split as necessary. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, @@ -1140,7 +1233,8 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, ulong flags) + phys_addr_t end, int nid, + enum memblock_flags flags) { phys_addr_t found; @@ -1162,7 +1256,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, - ulong flags) + enum memblock_flags flags) { return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, flags); @@ -1170,14 +1264,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, - int nid, ulong flags) + int nid, enum memblock_flags flags) { return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); phys_addr_t ret; again: @@ -1224,6 +1318,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +#if defined(CONFIG_NO_BOOTMEM) /** * memblock_virt_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes @@ -1240,7 +1335,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * The allocation is performed from memory region limited by * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. * - * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. @@ -1248,7 +1343,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * In addition, function sets the min_count to 0 using kmemleak_alloc for * allocated boot memory block, so that it is never reported as leaks. 
* - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ static void * __init memblock_virt_alloc_internal( @@ -1258,7 +1353,7 @@ static void * __init memblock_virt_alloc_internal( { phys_addr_t alloc; void *ptr; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; @@ -1333,7 +1428,7 @@ done: * info), if enabled. Does not zero allocated memory, does not panic if request * cannot be satisfied. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid_raw( @@ -1343,9 +1438,9 @@ void * __init memblock_virt_alloc_try_nid_raw( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); @@ -1370,7 +1465,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Public function, provides additional debug information (including caller * info), if enabled. This function zeroes the allocated memory. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid_nopanic( @@ -1380,9 +1475,9 @@ void * __init memblock_virt_alloc_try_nid_nopanic( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); @@ -1406,7 +1501,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. 
*/ void * __init memblock_virt_alloc_try_nid( @@ -1416,9 +1511,9 @@ void * __init memblock_virt_alloc_try_nid( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { @@ -1426,11 +1521,11 @@ void * __init memblock_virt_alloc_try_nid( return ptr; } - panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr); + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n", + __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); return NULL; } +#endif /** * __memblock_free_early - free boot memory block @@ -1442,16 +1537,17 @@ void * __init memblock_virt_alloc_try_nid( */ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) { - memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", - __func__, (u64)base, (u64)base + size - 1, - (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pF\n", + __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); memblock_remove_range(&memblock.reserved, base, size); } -/* +/** * __memblock_free_late - free bootmem block pages directly to buddy allocator - * @addr: phys starting address of the boot memory block + * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * This is only useful when the bootmem allocator has already been torn @@ -1460,11 +1556,11 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) */ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) { - u64 cursor, end; + phys_addr_t cursor, end; - memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", - __func__, (u64)base, (u64)base + size - 1, - (void *)_RET_IP_); + end = base + size - 1; + memblock_dbg("%s: [%pa-%pa] %pF\n", + __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); end = PFN_DOWN(base + size); @@ -1663,9 +1759,9 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, * @base: base of region to check * @size: size of region to check * - * Check if the region [@base, @base+@size) is a subset of a memory block. + * Check if the region [@base, @base + @size) is a subset of a memory block. * - * RETURNS: + * Return: * 0 if false, non-zero if true */ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) @@ -1684,9 +1780,10 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz * @base: base of region to check * @size: size of region to check * - * Check if the region [@base, @base+@size) intersects a reserved memory block. + * Check if the region [@base, @base + @size) intersects a reserved + * memory block. * - * RETURNS: + * Return: * True if they intersect, false if not. 
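[Editor's note: the memblock_is_region_memory()/_reserved() helpers in this hunk are, at bottom, a subset test and an intersection test on half-open [base, base + size) ranges. The arithmetic in isolation, as a runnable sketch:]

/* Illustrative only: subset / intersection tests for [base, base + size)
 * ranges, in the spirit of the memblock_is_region_*() helpers above. */
#include <stdio.h>
#include <stdbool.h>

typedef unsigned long long phys_addr_t;

static bool range_is_subset(phys_addr_t base, phys_addr_t size,
                            phys_addr_t rbase, phys_addr_t rsize)
{
    return rbase <= base && base + size <= rbase + rsize;
}

static bool range_intersects(phys_addr_t base, phys_addr_t size,
                             phys_addr_t rbase, phys_addr_t rsize)
{
    return base < rbase + rsize && rbase < base + size;
}

int main(void)
{
    /* reference region: [0x1000, 0x3000) */
    printf("subset:     %d\n", range_is_subset(0x1800, 0x800, 0x1000, 0x2000));   /* 1 */
    printf("intersects: %d\n", range_intersects(0x2800, 0x1000, 0x1000, 0x2000)); /* 1 */
    printf("disjoint:   %d\n", range_intersects(0x4000, 0x1000, 0x1000, 0x2000)); /* 0 */
    return 0;
}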
*/ bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) @@ -1733,7 +1830,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; - unsigned long flags; + enum memblock_flags flags; int idx; struct memblock_region *rgn; @@ -1751,7 +1848,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n", type->name, idx, &base, &end, &size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6f0d5ef320a..4ead5a4817de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,6 +233,21 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -246,12 +261,7 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
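[Editor's note: the for_each_mem_cgroup_tree()/for_each_mem_cgroup() macros moved above follow the usual pattern of wrapping a resumable iterator function in a for loop; as the accompanying comment says, breaking out early additionally requires mem_cgroup_iter_break() to drop the reference the iterator holds. Below is a toy userspace version of the macro shape over a parent-linked tree, with no reference counting and a node layout that is purely illustrative.]

/* Illustrative only: a "for_each_*" style macro built on a resumable
 * pre-order iterator, mirroring the shape of for_each_mem_cgroup_tree(). */
#include <stdio.h>
#include <stddef.h>

struct node {
    const char *name;
    struct node *parent;
    struct node *first_child;
    struct node *next_sibling;
};

/* Pre-order successor of @pos within the tree rooted at @root. */
static struct node *tree_next(struct node *root, struct node *pos)
{
    if (!pos)
        return root;
    if (pos->first_child)
        return pos->first_child;
    while (pos != root) {
        if (pos->next_sibling)
            return pos->next_sibling;
        pos = pos->parent;
    }
    return NULL;
}

#define for_each_node_tree(iter, root) \
    for (iter = tree_next(root, NULL); iter; iter = tree_next(root, iter))

int main(void)
{
    struct node root = { .name = "root" };
    struct node a    = { .name = "a",  .parent = &root };
    struct node b    = { .name = "b",  .parent = &root };
    struct node a1   = { .name = "a1", .parent = &a };
    struct node *iter;

    root.first_child = &a;
    a.next_sibling = &b;
    a.first_child = &a1;

    for_each_node_tree(iter, &root)
        printf("%s\n", iter->name);    /* root a a1 b */
    return 0;
}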
* The main reason for not using cgroup id for this: @@ -305,7 +315,135 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; -#endif /* !CONFIG_SLOB */ +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void memcg_free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + old = rcu_dereference_protected( + mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); + call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); + } + + return 0; +} + +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = mem_cgroup_nodeinfo(memcg, nid); + map = rcu_dereference_protected(pn->shrinker_map, true); + if (map) + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); + if (!map) { + memcg_free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +int memcg_expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + for_each_mem_cgroup(memcg) { + if (mem_cgroup_is_root(memcg)) + continue; + ret = memcg_expand_one_shrinker_map(memcg, size, old_size); + if (ret) + goto unlock; + } +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} + +void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + +#else /* CONFIG_MEMCG_KMEM */ +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } +#endif /* CONFIG_MEMCG_KMEM */ /** * mem_cgroup_css_from_page - css of the memcg associated with a page @@ -678,9 +816,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup 
*get_mem_cgroup_from_mm(struct mm_struct *mm) +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is + * returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *memcg = NULL; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; rcu_read_lock(); do { @@ -700,6 +849,46 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_unlock(); return memcg; } +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +/** + * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. + * @page: page from which memcg should be extracted. + * + * Obtain a reference on page->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. + */ +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} +EXPORT_SYMBOL(get_mem_cgroup_from_page); + +/** + * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. + */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + if (unlikely(current->active_memcg)) { + struct mem_cgroup *memcg = root_mem_cgroup; + + rcu_read_lock(); + if (css_tryget_online(¤t->active_memcg->css)) + memcg = current->active_memcg; + rcu_read_unlock(); + return memcg; + } + return get_mem_cgroup_from_mm(current->mm); +} /** * mem_cgroup_iter - iterate over memory cgroup hierarchy @@ -850,7 +1039,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) int nid; int i; - while ((memcg = parent_mem_cgroup(memcg))) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { for_each_node(nid) { mz = mem_cgroup_nodeinfo(memcg, nid); for (i = 0; i <= DEF_PRIORITY; i++) { @@ -862,21 +1051,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } } -/* - * Iteration constructs for visiting all cgroups (under a tree). If - * loops are exited prematurely (break), mem_cgroup_iter_break() must - * be used for reference counting. - */ -#define for_each_mem_cgroup_tree(iter, root) \ - for (iter = mem_cgroup_iter(root, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(root, iter, NULL)) - -#define for_each_mem_cgroup(iter) \ - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(NULL, iter, NULL)) - /** * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy * @memcg: hierarchy root @@ -1483,28 +1657,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +enum oom_status { + OOM_SUCCESS, + OOM_FAILED, + OOM_ASYNC, + OOM_SKIPPED +}; + +static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) - return; + if (order > PAGE_ALLOC_COSTLY_ORDER) + return OOM_SKIPPED; + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack * that holds all kinds of filesystem and mm locks. 
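[Editor's note: memcg_expand_one_shrinker_map() above grows each per-node shrinker bitmap by allocating a larger map, marking the whole old portion as set and clearing only the new tail ("Set all old bits, clear all new bits"), presumably to stay safe against bits being set on the old map while it is being replaced; the old map is then freed via call_rcu(). The memset pattern in isolation, as a runnable sketch with plain malloc/free in place of kvmalloc and RCU:]

/* Illustrative only: "grow a bitmap, treat the old part as all-set and the
 * new tail as clear", as in memcg_expand_one_shrinker_map() above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned char *expand_map(unsigned char *old, size_t old_size,
                                 size_t new_size)
{
    unsigned char *new = malloc(new_size);

    if (!new)
        return NULL;
    /* Conservatively treat every pre-existing bit as set ... */
    memset(new, 0xff, old_size);
    /* ... and only the newly added tail as clear. */
    memset(new + old_size, 0, new_size - old_size);
    free(old);        /* the kernel defers this via call_rcu() */
    return new;
}

int main(void)
{
    size_t old_size = 4, new_size = 8;
    unsigned char *map = calloc(old_size, 1);
    size_t i;

    map = expand_map(map, old_size, new_size);
    if (!map)
        return 1;
    for (i = 0; i < new_size; i++)
        printf("%02x ", map[i]);    /* ff ff ff ff 00 00 00 00 */
    printf("\n");
    free(map);
    return 0;
}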
* - * Also, the caller may handle a failed allocation gracefully - * (like optional page cache readahead) and so an OOM killer - * invocation might not even be necessary. + * cgroup1 allows disabling the OOM killer and waiting for outside + * handling until the charge can succeed; remember the context and put + * the task to sleep at the end of the page fault when all locks are + * released. * - * That's why we don't do anything here except remember the - * OOM context and then deal with it at the end of the page - * fault when the stack is unwound, the locks are released, - * and when we know whether the fault was overall successful. + * On the other hand, in-kernel OOM killer allows for an async victim + * memory reclaim (oom_reaper) and that means that we are not solely + * relying on the oom victim to make a forward progress and we can + * invoke the oom killer here. + * + * Please note that mem_cgroup_out_of_memory might fail to find a + * victim and then we have to bail out from the charge path. */ - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; + if (memcg->oom_kill_disable) { + if (!current->in_user_fault) + return OOM_SKIPPED; + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + + return OOM_ASYNC; + } + + if (mem_cgroup_out_of_memory(memcg, mask, order)) + return OOM_SUCCESS; + + WARN(1,"Memory cgroup charge failed because of no reclaimable memory! " + "This looks like a misconfiguration or a kernel bug."); + return OOM_FAILED; } /** @@ -1578,6 +1777,62 @@ cleanup: } /** + * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM + * @victim: task to be killed by the OOM killer + * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM + * + * Returns a pointer to a memory cgroup, which has to be cleaned up + * by killing all belonging OOM-killable tasks. + * + * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. + */ +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain) +{ + struct mem_cgroup *oom_group = NULL; + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return NULL; + + if (!oom_domain) + oom_domain = root_mem_cgroup; + + rcu_read_lock(); + + memcg = mem_cgroup_from_task(victim); + if (memcg == root_mem_cgroup) + goto out; + + /* + * Traverse the memory cgroup hierarchy from the victim task's + * cgroup up to the OOMing cgroup (or root) to find the + * highest-level memory cgroup with oom.group set. 
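[Editor's note: the comment above describes the walk that follows in mem_cgroup_get_oom_group(): it keeps overwriting its cursor while climbing from the victim's memcg towards the OOMing cgroup, so what remains at the end is the highest ancestor, at or below oom_domain, with memory.oom.group enabled. The same logic over a plain parent-linked list, as a sketch with no css reference counting:]

/* Illustrative only: climb a parent chain and remember the highest-level
 * node with a flag set, as mem_cgroup_get_oom_group() does for
 * memory.oom.group. */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct cgrp {
    const char *name;
    bool oom_group;
    struct cgrp *parent;
};

static struct cgrp *get_oom_group(struct cgrp *victim, struct cgrp *oom_domain)
{
    struct cgrp *oom_group = NULL;
    struct cgrp *c;

    for (c = victim; c; c = c->parent) {
        if (c->oom_group)
            oom_group = c;    /* later (higher) matches win */
        if (c == oom_domain)
            break;            /* never climb past the OOMing cgroup */
    }
    return oom_group;
}

int main(void)
{
    struct cgrp root  = { "root",  false, NULL };
    struct cgrp jobs  = { "jobs",  true,  &root };    /* oom.group=1 */
    struct cgrp job1  = { "job1",  true,  &jobs };    /* oom.group=1 */
    struct cgrp child = { "child", false, &job1 };

    struct cgrp *grp = get_oom_group(&child, &jobs);
    printf("%s\n", grp ? grp->name : "(none)");    /* prints "jobs" */
    return 0;
}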
+ */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + if (memcg->oom_group) + oom_group = memcg; + + if (memcg == oom_domain) + break; + } + + if (oom_group) + css_get(&oom_group->css); +out: + rcu_read_unlock(); + + return oom_group; +} + +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ + pr_info("Tasks in "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(" are going to be killed due to memory.oom.group set\n"); +} + +/** * lock_page_memcg - lock a page->mem_cgroup binding * @page: the page * @@ -1899,6 +2154,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; + bool oomed = false; + enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) return 0; @@ -1986,6 +2243,9 @@ retry: if (nr_retries--) goto retry; + if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + goto nomem; + if (gfp_mask & __GFP_NOFAIL) goto force; @@ -1994,8 +2254,23 @@ retry: memcg_memory_event(mem_over_limit, MEMCG_OOM); - mem_cgroup_oom(mem_over_limit, gfp_mask, + /* + * keep retrying as long as the memcg oom killer is able to make + * a forward progress or bypass the charge if the oom killer + * couldn't make any progress. + */ + oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); + switch (oom_status) { + case OOM_SUCCESS: + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + oomed = true; + goto retry; + case OOM_FAILED: + goto force; + default: + goto nomem; + } nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; @@ -2119,7 +2394,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_alloc_cache_id(void) { int id, size; @@ -2261,7 +2536,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2345,7 +2620,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) @@ -2384,7 +2659,7 @@ void memcg_kmem_uncharge(struct page *page, int order) css_put_many(&memcg->css, nr_pages); } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2680,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) -{ - struct mem_cgroup *iter; - int i; - - memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += memcg_page_state(iter, i); - } -} +struct accumulated_stats { + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long lru_pages[NR_LRU_LISTS]; + const unsigned int *stats_array; + const unsigned int *events_array; + int stats_size; + int events_size; +}; -static void tree_events(struct mem_cgroup *memcg, unsigned long *events) +static void accumulate_memcg_tree(struct mem_cgroup *memcg, + struct accumulated_stats *acc) { - struct mem_cgroup *iter; + struct mem_cgroup *mi; int i; - memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); + 
for_each_mem_cgroup_tree(mi, memcg) { + for (i = 0; i < acc->stats_size; i++) + acc->stat[i] += memcg_page_state(mi, + acc->stats_array ? acc->stats_array[i] : i); - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += memcg_sum_events(iter, i); + for (i = 0; i < acc->events_size; i++) + acc->events[i] += memcg_sum_events(mi, + acc->events_array ? acc->events_array[i] : i); + + for (i = 0; i < NR_LRU_LISTS; i++) + acc->lru_pages[i] += + mem_cgroup_nr_lru_pages(mi, BIT(i)); } } @@ -2779,7 +3059,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; @@ -2851,7 +3131,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) } rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); } @@ -2879,7 +3159,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static int memcg_update_kmem_max(struct mem_cgroup *memcg, unsigned long max) @@ -3113,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; + struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3145,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - unsigned long long val = 0; + memset(&acc, 0, sizeof(acc)); + acc.stats_size = ARRAY_SIZE(memcg1_stats); + acc.stats_array = memcg1_stats; + acc.events_size = ARRAY_SIZE(memcg1_events); + acc.events_array = memcg1_events; + accumulate_memcg_tree(memcg, &acc); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_page_state(mi, memcg1_stats[i]) * - PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + (u64)acc.stat[i] * PAGE_SIZE); } - for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { - unsigned long long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_sum_events(mi, memcg1_events[i]); - seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); - } - - for (i = 0; i < NR_LRU_LISTS; i++) { - unsigned long long val = 0; + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], + (u64)acc.events[i]); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -4037,6 +4313,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); +static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +{ + if (memcg->id.id > 0) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + } +} + static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { 
VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); @@ -4047,8 +4331,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { - idr_remove(&mem_cgroup_idr, memcg->id.id); - memcg->id.id = 0; + mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); @@ -4176,7 +4459,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); memcg->socket_pressure = jiffies; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -4185,8 +4468,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: - if (memcg->id.id > 0) - idr_remove(&mem_cgroup_idr, memcg->id.id); + mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); return NULL; } @@ -4245,6 +4527,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: + mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); return ERR_PTR(-ENOMEM); } @@ -4253,6 +4536,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * A memcg must be visible for memcg_expand_shrinker_maps() + * by the time the maps are allocated. So, we allocate maps + * here, when for_each_mem_cgroup() can't skip it. + */ + if (memcg_alloc_shrinker_maps(memcg)) { + mem_cgroup_id_remove(memcg); + return -ENOMEM; + } + /* Online state pins memcg ID, memcg ID pins CSS */ atomic_set(&memcg->id.ref, 1); css_get(css); @@ -4305,6 +4598,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); + memcg_free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } @@ -5249,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + struct accumulated_stats acc; int i; /* @@ -5264,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ - tree_stat(memcg, stat); - tree_events(memcg, events); + memset(&acc, 0, sizeof(acc)); + acc.stats_size = MEMCG_NR_STAT; + acc.events_size = NR_VM_EVENT_ITEMS; + accumulate_memcg_tree(memcg, &acc); seq_printf(m, "anon %llu\n", - (u64)stat[MEMCG_RSS] * PAGE_SIZE); + (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[NR_SLAB_RECLAIMABLE] + - stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + + acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)stat[NR_SHMEM] * PAGE_SIZE); + (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); + 
(u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[NR_WRITEBACK] * PAGE_SIZE); + (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); - for (i = 0; i < NR_LRU_LISTS; i++) { - struct mem_cgroup *mi; - unsigned long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)); - seq_printf(m, "%s %llu\n", - mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); - seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + - events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + - events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + + acc.events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + + acc.events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); seq_printf(m, "workingset_refault %lu\n", - stat[WORKINGSET_REFAULT]); + acc.stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - stat[WORKINGSET_ACTIVATE]); + acc.stat[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - stat[WORKINGSET_NODERECLAIM]); + acc.stat[WORKINGSET_NODERECLAIM]); return 0; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%d\n", memcg->oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, oom_group; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_group); + if (ret) + return ret; + + if (oom_group != 0 && oom_group != 1) + return -EINVAL; + + memcg->oom_group = oom_group; + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5369,6 +5689,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, + { + .name = "oom.group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, { } /* terminate */ }; @@ -5593,6 +5919,19 @@ out: return ret; } +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) +{ + 
struct mem_cgroup *memcg; + int ret; + + ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); + memcg = *memcgp; + mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); + return ret; +} + /** * mem_cgroup_commit_charge - commit a page charge * @page: page to charge @@ -6003,7 +6342,7 @@ static int __init mem_cgroup_init(void) { int cpu, node; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * Kmem cache creation is mostly done with the slab_mutex held, * so use a workqueue with limited concurrency to avoid stalling diff --git a/mm/memfd.c b/mm/memfd.c index 27069518e3c5..2bb5e257080e 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -326,7 +326,7 @@ SYSCALL_DEFINE2(memfd_create, goto err_fd; } file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; + file->f_flags |= O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { file_seals = memfd_file_seals_ptr(file); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9d142b9b86dc..0cd3de3550f0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -55,8 +55,10 @@ #include <linux/hugetlb.h> #include <linux/memory_hotplug.h> #include <linux/mm_inline.h> +#include <linux/memremap.h> #include <linux/kfifo.h> #include <linux/ratelimit.h> +#include <linux/page-isolation.h> #include "internal.h" #include "ras/ras_event.h" @@ -174,22 +176,51 @@ int hwpoison_filter(struct page *p) EXPORT_SYMBOL_GPL(hwpoison_filter); /* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + short size_shift; + char addr_valid; +}; + +/* * Send all the processes who have the page mapped a signal. * ``action optional'' if they are not immediately affected by the error * ``action required'' if error happened in current execution context */ -static int kill_proc(struct task_struct *t, unsigned long addr, - unsigned long pfn, struct page *page, int flags) +static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) { - short addr_lsb; + struct task_struct *t = tk->tsk; + short addr_lsb = tk->size_shift; int ret; pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); - addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { - ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, + ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, addr_lsb, current); } else { /* @@ -198,7 +229,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, * This could cause a loop when the user sets SIGBUS * to SIG_IGN, but hopefully no one will do that? */ - ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr, + ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); /* synchronous? 
*/ } if (ret < 0) @@ -234,34 +265,39 @@ void shake_page(struct page *p, int access) } EXPORT_SYMBOL_GPL(shake_page); -/* - * Kill all processes that have a poisoned page mapped and then isolate - * the page. - * - * General strategy: - * Find all processes having the page mapped and kill them. - * But we keep a page reference around so that the page is not - * actually freed yet. - * Then stash the page away - * - * There's no convenient way to get back to mapped processes - * from the VMAs. So do a brute-force search over all - * running processes. - * - * Remember that machine checks are not common (or rather - * if they are common you have other problems), so this shouldn't - * be a performance issue. - * - * Also there are some races possible while we get from the - * error detection to actually handle it. - */ - -struct to_kill { - struct list_head nd; - struct task_struct *tsk; - unsigned long addr; - char addr_valid; -}; +static unsigned long dev_pagemap_mapping_shift(struct page *page, + struct vm_area_struct *vma) +{ + unsigned long address = vma_address(page, vma); + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(vma->vm_mm, address); + if (!pgd_present(*pgd)) + return 0; + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + return 0; + if (pud_devmap(*pud)) + return PUD_SHIFT; + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + if (pmd_devmap(*pmd)) + return PMD_SHIFT; + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + return 0; + if (pte_devmap(*pte)) + return PAGE_SHIFT; + return 0; +} /* * Failure handling: if we can't find or can't kill a process there's @@ -292,6 +328,10 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } tk->addr = page_address_in_vma(p, vma); tk->addr_valid = 1; + if (is_zone_device_page(p)) + tk->size_shift = dev_pagemap_mapping_shift(p, vma); + else + tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT; /* * In theory we don't have to kill when the page was @@ -299,7 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * likely very rare kill anyways just out of paranoia, but use * a SIGKILL because the error is not contained anymore. */ - if (tk->addr == -EFAULT) { + if (tk->addr == -EFAULT || tk->size_shift == 0) { pr_info("Memory failure: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; @@ -317,9 +357,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. */ -static void kill_procs(struct list_head *to_kill, int forcekill, - bool fail, struct page *page, unsigned long pfn, - int flags) +static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, + unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -342,8 +381,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, * check for that, but we need to tell the * process anyways. 
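The size_shift recorded in struct to_kill above is what kill_proc() passes as the si_addr_lsb of the SIGBUS siginfo: PAGE_SHIFT for normal pages, PMD_SHIFT or PUD_SHIFT for device-dax mappings found by dev_pagemap_mapping_shift(). A hypothetical userspace handler that reports that information could look like the sketch below; the handler itself is not part of the patch.

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Sketch: report the address and granule size of a memory error.
 * si_addr_lsb is the log2 of the affected mapping unit. fprintf() is
 * not async-signal-safe; acceptable for a demonstration only.
 */
static void mce_handler(int sig, siginfo_t *info, void *ctx)
{
	if (info->si_code == BUS_MCEERR_AR || info->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "memory error at %p, %u-byte granule\n",
			info->si_addr, 1u << info->si_addr_lsb);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = mce_handler,
		.sa_flags = SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	/* ... access memory that may have been poisoned ... */
	pause();
	return 0;
}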
*/ - else if (kill_proc(tk->tsk, tk->addr, - pfn, page, flags) < 0) + else if (kill_proc(tk, pfn, flags) < 0) pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } @@ -515,6 +553,7 @@ static const char * const action_page_types[] = { [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MF_MSG_DAX] = "dax page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1012,7 +1051,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags); + kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); return unmap_success; } @@ -1112,6 +1151,83 @@ out: return res; } +static int memory_failure_dev_pagemap(unsigned long pfn, int flags, + struct dev_pagemap *pgmap) +{ + struct page *page = pfn_to_page(pfn); + const bool unmap_success = true; + unsigned long size = 0; + struct to_kill *tk; + LIST_HEAD(tokill); + int rc = -EBUSY; + loff_t start; + + /* + * Prevent the inode from being freed while we are interrogating + * the address_space, typically this would be handled by + * lock_page(), but dax pages do not use the page lock. This + * also prevents changes to the mapping of this pfn until + * poison signaling is complete. + */ + if (!dax_lock_mapping_entry(page)) + goto out; + + if (hwpoison_filter(page)) { + rc = 0; + goto unlock; + } + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_PUBLIC: + /* + * TODO: Handle HMM pages which may need coordination + * with device-side memory. + */ + goto unlock; + default: + break; + } + + /* + * Use this flag as an indication that the dax page has been + * remapped UC to prevent speculative consumption of poison. + */ + SetPageHWPoison(page); + + /* + * Unlike System-RAM there is no possibility to swap in a + * different physical page at a given virtual address, so all + * userspace consumption of ZONE_DEVICE memory necessitates + * SIGBUS (i.e. MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED); + + list_for_each_entry(tk, &tokill, nd) + if (tk->size_shift) + size = max(size, 1UL << tk->size_shift); + if (size) { + /* + * Unmap the largest mapping to avoid breaking up + * device-dax mappings which are constant size. The + * actual size of the mapping being torn down is + * communicated in siginfo, see kill_proc() + */ + start = (page->index << PAGE_SHIFT) & ~(size - 1); + unmap_mapping_range(page->mapping, start, start + size, 0); + } + kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); + rc = 0; +unlock: + dax_unlock_mapping_entry(page); +out: + /* drop pgmap ref acquired in caller */ + put_dev_pagemap(pgmap); + action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED); + return rc; +} + /** * memory_failure - Handle memory failure of a page. 
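To make the alignment in memory_failure_dev_pagemap() concrete: if the largest mapping found has size_shift 21 (a 2 MiB device-dax mapping), size is 0x200000, and for a page at file offset page->index << PAGE_SHIFT = 0x12345000 the computed start is 0x12345000 & ~0x1fffff = 0x12200000, so the whole 2 MiB mapping unit containing the poisoned page is torn down in one unmap_mapping_range() call instead of being split. (The numbers are made up for illustration.)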
* @pfn: Page Number of the corrupted page @@ -1134,6 +1250,7 @@ int memory_failure(unsigned long pfn, int flags) struct page *p; struct page *hpage; struct page *orig_head; + struct dev_pagemap *pgmap; int res; unsigned long page_flags; @@ -1146,6 +1263,10 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } + pgmap = get_dev_pagemap(pfn, NULL); + if (pgmap) + return memory_failure_dev_pagemap(pfn, flags, pgmap); + p = pfn_to_page(pfn); if (PageHuge(p)) return memory_failure_hugetlb(pfn, flags); @@ -1167,7 +1288,7 @@ int memory_failure(unsigned long pfn, int flags) * R/W the page; let's pray that the page has been * used and will be freed some time later. * In fact it's dangerous to directly bump up page count from 0, - * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { @@ -1598,8 +1719,18 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - if (PageHuge(page)) - dissolve_free_huge_page(page); + /* + * We set PG_hwpoison only when the migration source hugepage + * was successfully dissolved, because otherwise hwpoisoned + * hugepage remains on free hugepage list, then userspace will + * find it as SIGBUS by allocation failure. That's not expected + * in soft-offlining. + */ + ret = dissolve_free_huge_page(page); + if (!ret) { + if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); + } } return ret; } @@ -1687,6 +1818,7 @@ static int __soft_offline_page(struct page *page, int flags) static int soft_offline_in_use_page(struct page *page, int flags) { int ret; + int mt; struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { @@ -1705,23 +1837,37 @@ static int soft_offline_in_use_page(struct page *page, int flags) put_hwpoison_page(hpage); } + /* + * Setting MIGRATE_ISOLATE here ensures that the page will be linked + * to free list immediately (not via pcplist) when released after + * successful page migration. Otherwise we can't guarantee that the + * page is really free after put_page() returns, so + * set_hwpoison_free_buddy_page() highly likely fails. 
+ */ + mt = get_pageblock_migratetype(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); else ret = __soft_offline_page(page, flags); - + set_pageblock_migratetype(page, mt); return ret; } -static void soft_offline_free_page(struct page *page) +static int soft_offline_free_page(struct page *page) { + int rc = 0; struct page *head = compound_head(page); - if (!TestSetPageHWPoison(head)) { - num_poisoned_pages_inc(); - if (PageHuge(head)) - dissolve_free_huge_page(page); + if (PageHuge(head)) + rc = dissolve_free_huge_page(page); + if (!rc) { + if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); + else + rc = -EBUSY; } + return rc; } /** @@ -1751,6 +1897,14 @@ int soft_offline_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); + if (is_zone_device_page(page)) { + pr_debug_ratelimited("soft_offline: %#lx page is device page\n", + pfn); + if (flags & MF_COUNT_INCREASED) + put_page(page); + return -EIO; + } + if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); if (flags & MF_COUNT_INCREASED) @@ -1765,7 +1919,7 @@ int soft_offline_page(struct page *page, int flags) if (ret > 0) ret = soft_offline_in_use_page(page, flags); else if (ret == 0) - soft_offline_free_page(page); + ret = soft_offline_free_page(page); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 7206a634270b..c467102a5cbc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -238,23 +238,13 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, __tlb_reset_range(tlb); } -static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) +static void tlb_flush_mmu_free(struct mmu_gather *tlb) { - if (!tlb->end) - return; + struct mmu_gather_batch *batch; - tlb_flush(tlb); - mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif - __tlb_reset_range(tlb); -} - -static void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - struct mmu_gather_batch *batch; - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; @@ -330,6 +320,21 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * See the comment near struct mmu_table_batch. */ +/* + * If we want tlb_remove_table() to imply TLB invalidates. + */ +static inline void tlb_table_invalidate(struct mmu_gather *tlb) +{ +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE + /* + * Invalidate page-table caches used by hardware walkers. Then we still + * need to RCU-sched wait while freeing the pages because software + * walkers can still be in-flight. + */ + tlb_flush_mmu_tlbonly(tlb); +#endif +} + static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ @@ -366,6 +371,7 @@ void tlb_table_flush(struct mmu_gather *tlb) struct mmu_table_batch **batch = &tlb->batch; if (*batch) { + tlb_table_invalidate(tlb); call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } @@ -375,23 +381,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; - /* - * When there's less then two users of this mm there cannot be a - * concurrent page-table walk. 
- */ - if (atomic_read(&tlb->mm->mm_users) < 2) { - __tlb_remove_table(table); - return; - } - if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { + tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } + (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); @@ -853,6 +852,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } } + + if (pte_devmap(pte)) + return NULL; + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -917,6 +920,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, } } + if (pmd_devmap(pmd)) + return NULL; if (is_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) @@ -1017,7 +1022,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * If it's a COW mapping, write protect it both * in the parent and the child */ - if (is_cow_mapping(vm_flags)) { + if (is_cow_mapping(vm_flags) && pte_write(pte)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } @@ -1417,11 +1422,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { - if (next - addr != HPAGE_PMD_SIZE) { - VM_BUG_ON_VMA(vma_is_anonymous(vma) && - !rwsem_is_locked(&tlb->mm->mmap_sem), vma); + if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - } else if (zap_huge_pmd(tlb, vma, pmd, addr)) + else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ } @@ -1603,20 +1606,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, end, NULL); - - /* - * zap_page_range does not specify whether mmap_sem should be - * held for read or write. That allows parallel zap_page_range - * operations to unmap a PTE and defer a flush meaning that - * this call observes pte_none and fails to flush the TLB. - * Rather than adding a complex API, ensure that no stale - * TLB entries exist when this call returns. 
- */ - flush_tlb_range(vma, start, end); - } - mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -1886,6 +1877,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (!pfn_modify_allowed(pfn, pgprot)) + return -EACCES; + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, @@ -1921,6 +1915,9 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); + if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + return -EACCES; + /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* @@ -1982,6 +1979,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) @@ -1989,12 +1987,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -2003,6 +2005,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, { pmd_t *pmd; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); @@ -2011,9 +2014,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); - if (remap_pte_range(mm, pmd, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pmd++, addr = next, addr != end); return 0; } @@ -2024,6 +2028,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, { pud_t *pud; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); @@ -2031,9 +2036,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (remap_pmd_range(mm, pud, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pud++, addr = next, addr != end); return 0; } @@ -2044,6 +2050,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, { p4d_t *p4d; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); @@ -2051,9 +2058,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (remap_pud_range(mm, p4d, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pud_range(mm, p4d, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (p4d++, addr = next, addr != end); return 0; } @@ -2369,9 +2377,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. 
*/ -static int do_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { - int ret; + vm_fault_t ret; struct page *page = vmf->page; unsigned int old_flags = vmf->flags; @@ -2475,7 +2483,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct vm_fault *vmf) +static vm_fault_t wp_page_copy(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -2503,7 +2511,7 @@ static int wp_page_copy(struct vm_fault *vmf) cow_user_page(new_page, old_page, vmf->address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -2623,7 +2631,7 @@ oom: * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). */ -int finish_mkwrite_fault(struct vm_fault *vmf) +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, @@ -2644,12 +2652,12 @@ int finish_mkwrite_fault(struct vm_fault *vmf) * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct vm_fault *vmf) +static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - int ret; + vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; @@ -2662,7 +2670,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) return VM_FAULT_WRITE; } -static int wp_page_shared(struct vm_fault *vmf) +static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2670,7 +2678,7 @@ static int wp_page_shared(struct vm_fault *vmf) get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - int tmp; + vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); @@ -2713,7 +2721,7 @@ static int wp_page_shared(struct vm_fault *vmf) * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct vm_fault *vmf) +static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2889,7 +2897,7 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct vm_fault *vmf) +vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; @@ -2898,7 +2906,7 @@ int do_swap_page(struct vm_fault *vmf) pte_t pte; int locked; int exclusive = 0; - int ret = 0; + vm_fault_t ret = 0; if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; @@ -3003,8 +3011,8 @@ int do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3109,12 +3117,12 @@ out_release: * but allow concurrent faults), and pte mapped but not yet locked. 
* We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct vm_fault *vmf) +static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; - int ret = 0; + vm_fault_t ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -3165,7 +3173,8 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, + false)) goto oom_free_page; /* @@ -3223,10 +3232,10 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct vm_fault *vmf) +static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | @@ -3260,7 +3269,7 @@ static int pmd_devmap_trans_unstable(pmd_t *pmd) return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); } -static int pte_alloc_one_map(struct vm_fault *vmf) +static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3336,13 +3345,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static int do_set_pmd(struct vm_fault *vmf, struct page *page) +static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; - int i, ret; + int i; + vm_fault_t ret; if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; @@ -3372,7 +3382,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held @@ -3392,7 +3402,7 @@ out: return ret; } #else -static int do_set_pmd(struct vm_fault *vmf, struct page *page) +static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3413,13 +3423,13 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page) * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; - int ret; + vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { @@ -3478,10 +3488,10 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). */ -int finish_fault(struct vm_fault *vmf) +vm_fault_t finish_fault(struct vm_fault *vmf) { struct page *page; - int ret = 0; + vm_fault_t ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3567,12 +3577,13 @@ late_initcall(fault_around_debugfs); * (and therefore to page order). 
This way it's easier to guarantee * that we don't cross page table boundaries. */ -static int do_fault_around(struct vm_fault *vmf) +static vm_fault_t do_fault_around(struct vm_fault *vmf) { unsigned long address = vmf->address, nr_pages, mask; pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; - int off, ret = 0; + int off; + vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; @@ -3622,10 +3633,10 @@ out: return ret; } -static int do_read_fault(struct vm_fault *vmf) +static vm_fault_t do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret = 0; + vm_fault_t ret = 0; /* * Let's call ->map_pages() first and use ->fault() as fallback @@ -3649,10 +3660,10 @@ static int do_read_fault(struct vm_fault *vmf) return ret; } -static int do_cow_fault(struct vm_fault *vmf) +static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -3661,7 +3672,7 @@ static int do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; @@ -3688,10 +3699,10 @@ uncharge_out: return ret; } -static int do_shared_fault(struct vm_fault *vmf) +static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret, tmp; + vm_fault_t ret, tmp; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3729,10 +3740,10 @@ static int do_shared_fault(struct vm_fault *vmf) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). 
*/ -static int do_fault(struct vm_fault *vmf) +static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) @@ -3767,7 +3778,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct vm_fault *vmf) +static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; @@ -3857,7 +3868,7 @@ out: return 0; } -static inline int create_huge_pmd(struct vm_fault *vmf) +static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); @@ -3867,7 +3878,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf) } /* `inline' is required to avoid gcc 4.1.2 build error */ -static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); @@ -3886,7 +3897,7 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } -static int create_huge_pud(struct vm_fault *vmf) +static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ @@ -3898,7 +3909,7 @@ static int create_huge_pud(struct vm_fault *vmf) return VM_FAULT_FALLBACK; } -static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ @@ -3925,7 +3936,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct vm_fault *vmf) +static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; @@ -4013,8 +4024,8 @@ unlock: * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, @@ -4027,7 +4038,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; - int ret; + vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); @@ -4102,10 +4113,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, +vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - int ret; + vm_fault_t ret; __set_current_state(TASK_RUNNING); @@ -4125,7 +4136,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * space. Kernel faults are handled more gracefully. 
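Most hunks in this file only switch fault-path return types from int to vm_fault_t; the fault codes themselves are unchanged. For code outside this tree the conversion is equally mechanical. A hypothetical driver ->fault handler after the change might look like the sketch below; my_priv, the single-page layout and the backing page are invented for illustration.

#include <linux/mm.h>
#include <linux/mm_types.h>

struct my_priv {
	struct page *backing_page;	/* allocated at mmap time */
};

/* Sketch of a ->fault handler using the vm_fault_t return type. */
static vm_fault_t my_fault(struct vm_fault *vmf)
{
	struct my_priv *priv = vmf->vma->vm_private_data;

	if (vmf->pgoff != 0)			/* single-page mapping only */
		return VM_FAULT_SIGBUS;

	get_page(priv->backing_page);		/* fault path takes this ref */
	vmf->page = priv->backing_page;
	return 0;				/* caller maps vmf->page */
}

static const struct vm_operations_struct my_vm_ops = {
	.fault = my_fault,
};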
*/ if (flags & FAULT_FLAG_USER) - mem_cgroup_oom_enable(); + mem_cgroup_enter_user_fault(); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); @@ -4133,7 +4144,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { - mem_cgroup_oom_disable(); + mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no @@ -4397,6 +4408,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); + if (!maddr) + return -ENOMEM; + if (write) memcpy_toio(maddr + offset, buf, len); else @@ -4568,71 +4582,93 @@ EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -static void clear_gigantic_page(struct page *page, - unsigned long addr, - unsigned int pages_per_huge_page) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < pages_per_huge_page; - i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -void clear_huge_page(struct page *page, - unsigned long addr_hint, unsigned int pages_per_huge_page) +/* + * Process all subpages of the specified huge page with the specified + * operation. The target subpage will be processed last to keep its + * cache lines hot. + */ +static inline void process_huge_page( + unsigned long addr_hint, unsigned int pages_per_huge_page, + void (*process_subpage)(unsigned long addr, int idx, void *arg), + void *arg) { int i, n, base, l; unsigned long addr = addr_hint & ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); - if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, pages_per_huge_page); - return; - } - - /* Clear sub-page to access last to keep its cache lines hot */ + /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= pages_per_huge_page) { - /* If sub-page to access in first half of huge page */ + /* If target subpage in first half of huge page */ base = 0; l = n; - /* Clear sub-pages at the end of huge page */ + /* Process subpages at the end of huge page */ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + process_subpage(addr + i * PAGE_SIZE, i, arg); } } else { - /* If sub-page to access in second half of huge page */ + /* If target subpage in second half of huge page */ base = pages_per_huge_page - 2 * (pages_per_huge_page - n); l = pages_per_huge_page - n; - /* Clear sub-pages at the begin of huge page */ + /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + process_subpage(addr + i * PAGE_SIZE, i, arg); } } /* - * Clear remaining sub-pages in left-right-left-right pattern - * towards the sub-page to access + * Process remaining subpages in left-right-left-right pattern + * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); - clear_user_highpage(page + left_idx, - addr + left_idx * PAGE_SIZE); + process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); + cond_resched(); + process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); + } +} + +static 
void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { cond_resched(); - clear_user_highpage(page + right_idx, - addr + right_idx * PAGE_SIZE); + clear_user_highpage(p, addr + i * PAGE_SIZE); } } +static void clear_subpage(unsigned long addr, int idx, void *arg) +{ + struct page *page = arg; + + clear_user_highpage(page + idx, addr); +} + +void clear_huge_page(struct page *page, + unsigned long addr_hint, unsigned int pages_per_huge_page) +{ + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); +} + static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, @@ -4652,11 +4688,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, } } +struct copy_subpage_arg { + struct page *dst; + struct page *src; + struct vm_area_struct *vma; +}; + +static void copy_subpage(unsigned long addr, int idx, void *arg) +{ + struct copy_subpage_arg *copy_arg = arg; + + copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, + addr, copy_arg->vma); +} + void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { - int i; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + struct copy_subpage_arg arg = { + .dst = dst, + .src = src, + .vma = vma, + }; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { copy_user_gigantic_page(dst, src, addr, vma, @@ -4664,11 +4720,7 @@ void copy_user_huge_page(struct page *dst, struct page *src, return; } - might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } + process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); } long copy_huge_page_from_user(struct page *dst_page, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7deb49f69e27..9eea6e809a4e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat) static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); @@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ + pgdat->node_id = nid; + pgdat->node_start_pfn = start_pfn; + /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, zones_size, start_pfn, zholes_size); + free_area_init_core_hotplug(nid); pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* @@ -1017,25 +1018,20 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat); /* - * zone->managed_pages is set to an approximate value in - * free_area_init_core(), which will cause - * /sys/device/system/node/nodeX/meminfo has wrong data. 
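The process_huge_page() helper factored out above always touches the target subpage last so that its cache lines stay hot for the access that triggered the fault. The visit order is easiest to see in isolation; the standalone userspace sketch below reproduces the same left-right logic and prints the order for an arbitrarily chosen 8-subpage huge page with target index 2.

#include <stdio.h>

/*
 * Standalone sketch of the subpage visit order used by process_huge_page():
 * subpages far from the target are handled first, then the remainder is
 * walked left-right towards the target, which comes last.
 */
static void print_order(int pages, int n)	/* n = target subpage index */
{
	int i, base, l;

	if (2 * n <= pages) {
		base = 0;
		l = n;
		for (i = pages - 1; i >= 2 * n; i--)	/* tail first */
			printf("%d ", i);
	} else {
		base = pages - 2 * (pages - n);
		l = pages - n;
		for (i = 0; i < base; i++)		/* head first */
			printf("%d ", i);
	}
	for (i = 0; i < l; i++) {			/* converge on target */
		printf("%d ", base + i);		/* left */
		printf("%d ", base + 2 * l - 1 - i);	/* right */
	}
	printf("\n");
}

int main(void)
{
	print_order(8, 2);	/* prints: 7 6 5 4 0 3 1 2 */
	return 0;
}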
- * So reset it to 0 before any memory is onlined. - */ - reset_node_managed_pages(pgdat); - - /* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). */ + reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; } -static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +static void rollback_node_hotadd(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); + arch_refresh_nodedata(nid, NULL); free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); @@ -1046,28 +1042,48 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) /** * try_online_node - online a node if offlined * @nid: the node ID - * + * @start: start addr of the node + * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. + * + * Returns: + * 1 -> a new node has been allocated + * 0 -> the node is already online + * -ENOMEM -> the node could not be allocated */ -int try_online_node(int nid) +static int __try_online_node(int nid, u64 start, bool set_node_online) { - pg_data_t *pgdat; - int ret; + pg_data_t *pgdat; + int ret = 1; if (node_online(nid)) return 0; - mem_hotplug_begin(); - pgdat = hotadd_new_pgdat(nid, 0); + pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); + + if (set_node_online) { + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + } out: + return ret; +} + +/* + * Users of this function always want to online/register the node + */ +int try_online_node(int nid) +{ + int ret; + + mem_hotplug_begin(); + ret = __try_online_node(nid, 0, true); mem_hotplug_done(); return ret; } @@ -1099,9 +1115,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; - pg_data_t *pgdat = NULL; - bool new_pgdat; - bool new_node; + bool new_node = false; int ret; start = res->start; @@ -1111,11 +1125,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) if (ret) return ret; - { /* Stupid hack to suppress address-never-null warning */ - void *p = NODE_DATA(nid); - new_pgdat = !p; - } - mem_hotplug_begin(); /* @@ -1126,48 +1135,31 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) */ memblock_add_node(start, size, nid); - new_node = !node_online(nid); - if (new_node) { - pgdat = hotadd_new_pgdat(nid, start); - ret = -ENOMEM; - if (!pgdat) - goto error; - } + ret = __try_online_node(nid, start, false); + if (ret < 0) + goto error; + new_node = ret; /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, NULL, true); - if (ret < 0) goto error; - /* we online node here. we can't roll back from here. */ - node_set_online(nid); - if (new_node) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - ret = __register_one_node(nid); - if (ret) - goto register_fail; - - /* - * link memory sections under this node. This is already - * done when creatig memory section in register_new_memory - * but that depends to have the node registered so offline - * nodes have to go through register_node. - * TODO clean up this mess. 
- */ - ret = link_mem_sections(nid, start_pfn, nr_pages, false); -register_fail: - /* - * If sysfs file of new node can't create, cpu on the node + /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. * So, check by BUG_ON() to catch it reluctantly.. + * We online node here. We can't roll back from here. */ + node_set_online(nid); + ret = __register_one_node(nid); BUG_ON(ret); } + /* link memory sections under this node.*/ + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); + BUG_ON(ret); + /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); @@ -1180,8 +1172,8 @@ register_fail: error: /* rollback pgdat allocation and others */ - if (new_pgdat && pgdat) - rollback_node_hotadd(nid, pgdat); + if (new_node) + rollback_node_hotadd(nid); memblock_remove(start, size); out: diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9ac49ef17b4e..da858f794eb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); - return z->zone ? z->zone->node : node; + return z->zone ? zone_to_nid(z->zone) : node; } default: @@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); - polnid = z->zone->node; + polnid = zone_to_nid(z->zone); break; default: @@ -2504,7 +2504,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); + vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/mempool.c b/mm/mempool.c index b54f2c20e5e0..0ef8cc8d1602 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -111,7 +111,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) +static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_slab(element); @@ -127,12 +127,12 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool, gfp_t flags) +static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element, flags); + kasan_unpoison_element(pool, element); check_element(pool, element); return element; } @@ -151,7 +151,7 @@ static void *remove_element(mempool_t *pool, gfp_t flags) void mempool_exit(mempool_t *pool) { while (pool->curr_nr) { - void *element = remove_element(pool, GFP_KERNEL); + void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool + * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. 
@@ -301,7 +302,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool, GFP_KERNEL); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -387,7 +388,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool, gfp_temp); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); diff --git a/mm/migrate.c b/mm/migrate.c index 8c0af0f7cab1..d6a2e89b086a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1131,7 +1131,8 @@ out: * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work * around it. */ -#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) +#if defined(CONFIG_ARM) && \ + defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 #define ICE_noinline noinline #else #define ICE_noinline @@ -1211,7 +1212,7 @@ out: * intentionally. Although it's rather weird, * it's how HWPoison flag works at the moment. */ - if (!test_set_page_hwpoison(page)) + if (set_hwpoison_free_buddy_page(page)) num_poisoned_pages_inc(); } } else { @@ -1330,8 +1331,6 @@ put_anon: out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) - num_poisoned_pages_inc(); /* * If migration was not successful and there's a freeing callback, use @@ -2951,7 +2950,8 @@ int migrate_vma(const struct migrate_vma_ops *ops, /* Sanity check the arguments */ start &= PAGE_MASK; end &= PAGE_MASK; - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) return -EINVAL; if (start < vma->vm_start || start >= vma->vm_end) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 74e5a6547c3d..41cc47e28ad6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -527,7 +527,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, vm_flags_t old_flags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || + vma_is_dax(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..6838a530789b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void) zone->name); /* Iterate the zonelist */ - for_each_zone_zonelist(zone, z, zonelist, zoneid) { -#ifdef CONFIG_NUMA - pr_cont("%d:%s ", zone->node, zone->name); -#else - pr_cont("0:%s ", zone->name); -#endif /* CONFIG_NUMA */ - } + for_each_zone_zonelist(zone, z, zonelist, zoneid) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); pr_cont("\n"); } } diff --git a/mm/mmap.c b/mm/mmap.c index d1eb87ef4b1a..5f2b2b184c60 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,12 +182,12 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return next; } -static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); - +static int do_brk_flags(unsigned long 
addr, unsigned long request, unsigned long flags, + struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; @@ -245,7 +245,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) + if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; set_brk: @@ -911,7 +911,7 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + vm_area_free(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -1780,6 +1778,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, error = shmem_zero_setup(vma); if (error) goto free_vma; + } else { + vma_set_anonymous(vma); } vma_link(mm, vma, prev, rb_link, rb_parent); @@ -1796,11 +1796,12 @@ out: vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm))) - mm->locked_vm += (len >> PAGE_SHIFT); - else + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); } if (file) @@ -1832,7 +1833,7 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -2620,15 +2621,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return err; } - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) return -ENOMEM; - /* most fields are the same, copy all, and then fixup */ - *new = *vma; - - INIT_LIST_HEAD(&new->anon_vma_chain); - if (new_below) new->vm_end = addr; else { @@ -2669,7 +2665,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: - kmem_cache_free(vm_area_cachep, new); + vm_area_free(new); return err; } @@ -2929,21 +2925,14 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) +static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; - len = PAGE_ALIGN(request); - if (len < request) - return -ENOMEM; - if (!len) - return 0; - /* Until we need other flags, refuse anything except VM_EXEC. 
*/ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; @@ -2991,14 +2980,13 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(mm); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; + vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; @@ -3015,18 +3003,20 @@ out: return 0; } -static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) -{ - return do_brk_flags(addr, len, 0, uf); -} - -int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) +int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + unsigned long len; int ret; bool populate; LIST_HEAD(uf); + len = PAGE_ALIGN(request); + if (len < request) + return -ENOMEM; + if (!len) + return 0; + if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -3073,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm) * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ - mutex_lock(&oom_lock); - __oom_reap_task_mm(mm); - mutex_unlock(&oom_lock); + (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); @@ -3207,16 +3195,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { - new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new_vma = vm_area_dup(vma); if (!new_vma) goto out; - *new_vma = *vma; new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) @@ -3231,7 +3217,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: - kmem_cache_free(vm_area_cachep, new_vma); + vm_area_free(new_vma); out: return NULL; } @@ -3355,12 +3341,10 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -3381,7 +3365,7 @@ static struct vm_area_struct *__install_special_mapping( return vma; out: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ERR_PTR(ret); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index eff6b88a993f..82bb1a939c0e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable) { struct mmu_notifier *mn; + int ret = 0; int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + if (mn->ops->invalidate_range_start) { + int _ret = mn->ops->invalidate_range_start(mn, mm, 
start, end, blockable); + if (_ret) { + pr_info("%pS callback failed with %d in %sblockable context.\n", + mn->ops->invalidate_range_start, _ret, + !blockable ? "non-" : ""); + ret = _ret; + } + } } srcu_read_unlock(&srcu, id); + + return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); diff --git a/mm/mprotect.c b/mm/mprotect.c index 625608bc8962..6d331620b9e5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -306,6 +306,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, return pages; } +static int prot_none_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_test(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return 0; +} + +static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + pgprot_t new_pgprot = vm_get_page_prot(newflags); + struct mm_walk prot_none_walk = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, + .mm = current->mm, + .private = &new_pgprot, + }; + + return walk_page_range(start, end, &prot_none_walk); +} + int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) @@ -324,6 +360,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } /* + * Do PROT_NONE PFN permission checks here when we can still + * bail out without undoing a lot of state. This is a rather + * uncommon case, so doesn't need to be very optimized. + */ + if (arch_has_pfn_modify_check() && + (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { + error = prot_none_walk(vma, start, end, newflags); + if (error) + return error; + } + + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again. hugetlb mapping were accounted for diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 9b02fda0886b..439af3b765a7 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -42,7 +42,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, { void *ptr; u64 addr; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); if (limit > memblock.current_limit) limit = memblock.current_limit; @@ -72,7 +72,7 @@ again: return ptr; } -/* +/** * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range * @size: size of the range in bytes @@ -176,7 +176,7 @@ void __init reset_all_zones_managed_pages(void) /** * free_all_bootmem - release free pages to the buddy allocator * - * Returns the number of pages actually released. + * Return: the number of pages actually released. 
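The mprotect.c hunk above adds a page-table walk that validates every PFN in a VM_PFNMAP/VM_MIXEDMAP area before the range is made PROT_NONE, so the operation can bail out before any state has been changed. The general shape, walk a range, apply a per-page predicate, and abort with -EACCES on the first failure, is easy to show in userspace. The walker and the pfn_allowed() predicate below are invented stand-ins for illustration, not the kernel's mm_walk API:

```c
#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

#define PAGE_SHIFT 12

/* Invented stand-in for pfn_modify_allowed(): pretend odd PFNs are off limits. */
static bool pfn_allowed(unsigned long pfn)
{
	return (pfn & 1) == 0;
}

/*
 * Walk [start, end) page by page and fail fast, before touching anything,
 * if any page's PFN may not be mapped with the new protection.
 */
static int prot_none_check(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += 1UL << PAGE_SHIFT) {
		unsigned long pfn = addr >> PAGE_SHIFT;	/* identity "mapping" for the demo */

		if (!pfn_allowed(pfn))
			return -EACCES;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", prot_none_check(0x2000, 0x4000));	/* pfns 2,3 -> -EACCES */
	printf("%d\n", prot_none_check(0x2000, 0x3000));	/* pfn 2 only -> 0 */
	return 0;
}
```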
*/ unsigned long __init free_all_bootmem(void) { @@ -193,7 +193,7 @@ unsigned long __init free_all_bootmem(void) /** * free_bootmem_node - mark a page range as usable * @pgdat: node the range resides on - * @physaddr: starting address of the range + * @physaddr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -208,7 +208,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -256,7 +256,7 @@ restart: * * Allocation may happen on any node in the system. * - * Returns NULL on failure. + * Return: address of the allocated region or %NULL on failure. */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) @@ -293,6 +293,8 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, * Allocation may happen on any node in the system. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) @@ -367,6 +369,8 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, * can not hold the requested memory. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) @@ -396,6 +400,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, * Allocation may happen on any node in the system. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) @@ -425,6 +431,8 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size, * can not hold the requested memory. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. 
*/ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) diff --git a/mm/nommu.c b/mm/nommu.c index 4452d8bd9ae4..e4aac33216ae 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -364,10 +364,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size @@ -769,7 +765,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); } /* @@ -1145,6 +1141,8 @@ static int do_mmap_private(struct vm_area_struct *vma, if (ret < len) memset(base + ret, 0, len - ret); + } else { + vma_set_anonymous(vma); } return 0; @@ -1204,7 +1202,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(current->mm); if (!vma) goto error_getting_vma; @@ -1212,7 +1210,6 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; @@ -1368,7 +1365,7 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ret; sharing_violation: @@ -1469,14 +1466,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!region) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) { kmem_cache_free(vm_region_jar, region); return -ENOMEM; } /* most fields are the same, copy all, and then fixup */ - *new = *vma; *region = *vma->vm_region; new->vm_region = region; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 84081e77bc51..b5b25e4dcbbb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,6 +53,14 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; +/* + * Serializes oom killer invocations (out_of_memory()) from all contexts to + * prevent from over eager oom killing (e.g. when the oom killer is invoked + * from different domains). 
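The block comment being added around oom_lock in this hunk documents two roles: the mutex serializes out_of_memory() callers from all contexts, and it is what lets oom_killer_disable() observe a stable oom_killer_disabled flag. A rough userspace analogue with a pthread mutex, purely illustrative and with invented names (build with -pthread):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;
static bool oom_killer_disabled;

/* All "OOM" invocations funnel through one lock, so at most one runs at a time. */
static bool out_of_memory_demo(void)
{
	bool killed = false;

	pthread_mutex_lock(&oom_lock);
	if (!oom_killer_disabled) {
		/* a victim would be selected and killed here */
		killed = true;
	}
	pthread_mutex_unlock(&oom_lock);
	return killed;
}

/* Flipping the flag under the same lock means no kill can race with the disable. */
static void oom_killer_disable_demo(void)
{
	pthread_mutex_lock(&oom_lock);
	oom_killer_disabled = true;
	pthread_mutex_unlock(&oom_lock);
}

int main(void)
{
	printf("%d\n", out_of_memory_demo());	/* 1: killer still enabled */
	oom_killer_disable_demo();
	printf("%d\n", out_of_memory_demo());	/* 0: disabled, nothing killed */
	return 0;
}
```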
+ * + * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled + * and mark_oom_victim + */ DEFINE_MUTEX(oom_lock); #ifdef CONFIG_NUMA @@ -392,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("Tasks state (memory values in pages):\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -408,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), mm_pgtables_bytes(task->mm), @@ -479,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -void __oom_reap_task_mm(struct mm_struct *mm) +bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + bool ret = true; /* * Tell all users of get_user/copy_from_user etc... that the content @@ -511,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); + if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { + ret = false; + continue; + } unmap_page_range(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } } + + return ret; } +/* + * Reaps the address space of the give task. + * + * Returns true on success and false if none or part of the address space + * has been reclaimed and the caller should retry later. + */ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { bool ret = true; - /* - * We have to make sure to not race with the victim exit path - * and cause premature new oom victim selection: - * oom_reap_task_mm exit_mm - * mmget_not_zero - * mmput - * atomic_dec_and_test - * exit_oom_victim - * [...] - * out_of_memory - * select_bad_process - * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory - */ - mutex_lock(&oom_lock); - if (!down_read_trylock(&mm->mmap_sem)) { - ret = false; trace_skip_task_reaping(tsk->pid); - goto unlock_oom; - } - - /* - * If the mm has invalidate_{start,end}() notifiers that could block, - * sleep to give the oom victim some more time. - * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time - */ - if (mm_has_blockable_invalidate_notifiers(mm)) { - up_read(&mm->mmap_sem); - schedule_timeout_idle(HZ); - goto unlock_oom; + return false; } /* @@ -564,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * down_write();up_write() cycle in exit_mmap(). */ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { - up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + goto out_unlock; } trace_start_task_reaping(tsk->pid); - __oom_reap_task_mm(mm); + /* failed to reap part of the address space. 
Try again later */ + ret = __oom_reap_task_mm(mm); + if (!ret) + goto out_finish; pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES))); +out_finish: + trace_finish_task_reaping(tsk->pid); +out_unlock: up_read(&mm->mmap_sem); - trace_finish_task_reaping(tsk->pid); -unlock_oom: - mutex_unlock(&oom_lock); return ret; } @@ -835,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; - /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly - */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); - return; - } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); - p = find_lock_task_mm(victim); if (!p) { put_task_struct(victim); @@ -920,7 +858,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * in order to prevent the OOM victim from depleting the memory * reserves from the user space under its control. */ - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -958,7 +896,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) */ if (unlikely(p->flags & PF_KTHREAD)) continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID); } rcu_read_unlock(); @@ -971,6 +909,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message) #undef K /* + * Kill provided task unless it's secured by setting + * oom_score_adj to OOM_SCORE_ADJ_MIN. 
+ */ +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(task); + __oom_kill_process(task); + } + return 0; +} + +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + struct mem_cgroup *oom_group; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + /* + * Do we need to kill the entire memory cgroup? + * Or even one of the ancestor memory cgroups? + * Check this out before killing the victim task. + */ + oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); + + __oom_kill_process(victim); + + /* + * If necessary, kill all tasks in the selected memory cgroup. + */ + if (oom_group) { + mem_cgroup_print_oom_group(oom_group); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_put(oom_group); + } +} + +/* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ static void check_panic_on_oom(struct oom_control *oc, @@ -1077,15 +1108,9 @@ bool out_of_memory(struct oom_control *oc) dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. 
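The rewritten oom_kill_process() above can now take out an entire memory cgroup: after killing the primary victim it scans the selected group and kills every member whose oom_score_adj is not OOM_SCORE_ADJ_MIN. A toy version of that scan-and-skip loop; the task list and kill_task() are made up for illustration and stand in for mem_cgroup_scan_tasks()/__oom_kill_process():

```c
#include <stdio.h>

#define OOM_SCORE_ADJ_MIN (-1000)

struct demo_task {
	const char *comm;
	int oom_score_adj;
};

/* Stand-in for __oom_kill_process(): just report the would-be victim. */
static void kill_task(const struct demo_task *t)
{
	printf("killing %s\n", t->comm);
}

/* Mirrors oom_kill_memcg_member(): explicitly protected tasks are skipped. */
static void scan_group(const struct demo_task *tasks, int n)
{
	for (int i = 0; i < n; i++) {
		if (tasks[i].oom_score_adj == OOM_SCORE_ADJ_MIN)
			continue;	/* secured against the OOM killer */
		kill_task(&tasks[i]);
	}
}

int main(void)
{
	struct demo_task group[] = {
		{ "worker-1", 0 },
		{ "protected", OOM_SCORE_ADJ_MIN },
		{ "worker-2", 200 },
	};

	scan_group(group, 3);	/* kills worker-1 and worker-2, skips protected */
	return 0;
}
```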
- */ - schedule_timeout_killable(1); - } return !!oc->chosen; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 337c6afb3345..84ae9bf5858a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -27,7 +27,6 @@ #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/percpu.h> -#include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> @@ -2490,8 +2489,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to + * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written + * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to * systematic errors in balanced_dirty_ratelimit and the dirty pages position * control. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1521100f1e63..05e983f42316 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,7 +32,6 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/oom.h> -#include <linux/notifier.h> #include <linux/topology.h> #include <linux/sysctl.h> #include <linux/cpu.h> @@ -155,16 +154,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with pm_mutex held (gfp_allowed_mask also should - * only be modified with pm_mutex held, unless the suspend/hibernate code is - * guaranteed not to run in parallel with that modification). + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; @@ -173,7 +173,7 @@ void pm_restore_gfp_mask(void) void pm_restrict_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); @@ -2908,10 +2908,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) if (!static_branch_likely(&vm_numa_stat_key)) return; - if (z->node != numa_node_id()) + if (zone_to_nid(z) != numa_node_id()) local_stat = NUMA_OTHER; - if (z->node == preferred_zone->node) + if (zone_to_nid(z) == zone_to_nid(preferred_zone)) __inc_numa_state(z, NUMA_HIT); else { __inc_numa_state(z, NUMA_MISS); @@ -4164,11 +4164,12 @@ retry: alloc_flags = reserve_flags; /* - * Reset the zonelist iterators if memory policies can be ignored. - * These allocations are high priority and system rather than user - * orientated. + * Reset the nodemask and zonelist iterators if memory policies can be + * ignored. These allocations are high priority and system rather than + * user oriented. 
*/ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { + ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } @@ -4402,19 +4403,15 @@ out: EXPORT_SYMBOL(__alloc_pages_nodemask); /* - * Common helper functions. + * Common helper functions. Never use with __GFP_HIGHMEM because the returned + * address cannot represent highmem pages. Use alloc_pages and then kmap if + * you need to access high mem. */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - /* - * __get_free_pages() returns a virtual address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask, order); + page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0; return (unsigned long) page_address(page); @@ -5280,7 +5277,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return z->zone->node; + return zone_to_nid(z->zone); } #endif @@ -5566,13 +5563,12 @@ static int zone_batchsize(struct zone *zone) /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/2 of a meg. - * - * OK, so we don't know how big the cache is. So guess. + * size of the zone. */ batch = zone->managed_pages / 1024; - if (batch * PAGE_SIZE > 512 * 1024) - batch = (512 * 1024) / PAGE_SIZE; + /* But no more than a meg. */ + if (batch * PAGE_SIZE > 1024 * 1024) + batch = (1024 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -6123,7 +6119,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l return usemapsize / 8; } -static void __init setup_usemap(struct pglist_data *pgdat, +static void __ref setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zone_start_pfn, unsigned long zonesize) @@ -6143,7 +6139,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __paginginit set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -6171,14 +6167,14 @@ void __paginginit set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __paginginit set_pageblock_order(void) +void __init set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) +static unsigned long __init calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) { unsigned long pages = spanned_pages; @@ -6197,39 +6193,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. 
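Among the page_alloc.c hunks above, zone_batchsize() now caps the per-cpu batch at one megabyte worth of pages instead of half a megabyte. The visible part of the calculation is simple enough to check by hand; the sketch below reproduces only what the hunk shows and omits the later rounding the kernel function also performs:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Just the part of zone_batchsize() visible in the hunk above. */
static unsigned long batch_estimate(unsigned long managed_pages)
{
	unsigned long batch = managed_pages / 1024;	/* ~1/1000th of the zone */

	if (batch * PAGE_SIZE > 1024 * 1024)		/* but no more than a meg */
		batch = (1024 * 1024) / PAGE_SIZE;
	batch /= 4;					/* effectively *= 4 elsewhere */
	if (batch < 1)
		batch = 1;
	return batch;
}

int main(void)
{
	/* A 4 GiB zone: 1048576 pages -> 1024 -> capped at 256 -> 64. */
	printf("%lu\n", batch_estimate(1048576));
	/* A tiny 8 MiB zone: 2048 pages -> 2 -> 0 -> clamped to 1. */
	printf("%lu\n", batch_estimate(2048));
	return 0;
}
```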
- */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat) -{ - enum zone_type j; - int nid = pgdat->node_id; - - pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING +static void pgdat_init_numabalancing(struct pglist_data *pgdat) +{ spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; +} +#else +static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pgdat_init_split_queue(struct pglist_data *pgdat) +{ spin_lock_init(&pgdat->split_queue_lock); INIT_LIST_HEAD(&pgdat->split_queue); pgdat->split_queue_len = 0; +} +#else +static void pgdat_init_split_queue(struct pglist_data *pgdat) {} #endif - init_waitqueue_head(&pgdat->kswapd_wait); - init_waitqueue_head(&pgdat->pfmemalloc_wait); + #ifdef CONFIG_COMPACTION +static void pgdat_init_kcompactd(struct pglist_data *pgdat) +{ init_waitqueue_head(&pgdat->kcompactd_wait); +} +#else +static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} #endif + +static void __meminit pgdat_init_internals(struct pglist_data *pgdat) +{ + pgdat_resize_init(pgdat); + + pgdat_init_numabalancing(pgdat); + pgdat_init_split_queue(pgdat); + pgdat_init_kcompactd(pgdat); + + init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->pfmemalloc_wait); + pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); +} + +static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, + unsigned long remaining_pages) +{ + zone->managed_pages = remaining_pages; + zone_set_nid(zone, nid); + zone->name = zone_names[idx]; + zone->zone_pgdat = NODE_DATA(nid); + spin_lock_init(&zone->lock); + zone_seqlock_init(zone); + zone_pcp_init(zone); +} + +/* + * Set up the zone data structures + * - init pgdat internals + * - init all zones belonging to this node + * + * NOTE: this function is only called during memory hotplug + */ +#ifdef CONFIG_MEMORY_HOTPLUG +void __ref free_area_init_core_hotplug(int nid) +{ + enum zone_type z; + pg_data_t *pgdat = NODE_DATA(nid); + + pgdat_init_internals(pgdat); + for (z = 0; z < MAX_NR_ZONES; z++) + zone_init_internals(&pgdat->node_zones[z], z, nid, 0); +} +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. + * NOTE: this function is only called during early init. + */ +static void __init free_area_init_core(struct pglist_data *pgdat) +{ + enum zone_type j; + int nid = pgdat->node_id; + pgdat_init_internals(pgdat); pgdat->per_cpu_nodestats = &boot_nodestats; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -6277,15 +6333,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. 
*/ - zone->managed_pages = freesize; -#ifdef CONFIG_NUMA - zone->node = nid; -#endif - zone->name = zone_names[j]; - zone->zone_pgdat = pgdat; - spin_lock_init(&zone->lock); - zone_seqlock_init(zone); - zone_pcp_init(zone); + zone_init_internals(zone, j, nid, freesize); if (!size) continue; @@ -6345,8 +6393,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLAT_NODE_MEM_MAP */ -void __paginginit free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, unsigned long *zholes_size) +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) +{ + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +} +#else +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} +#endif + +void __init free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; @@ -6370,20 +6434,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); + pgdat_set_deferred_range(pgdat); -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - /* - * We start only with one section of pages, more pages are added as - * needed until the rest of deferred pages are initialized. - */ - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, - pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -#endif free_area_init_core(pgdat); } -#ifdef CONFIG_HAVE_MEMBLOCK +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). But, there are some @@ -6391,7 +6447,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. 
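The zero_resv_unavail() change just below no longer probes every single pfn of an invalid pageblock: when the block's first pfn is invalid, it advances pfn to the last pfn of that block so the loop's pfn++ lands on the next block. A small standalone version of that skip, with an invented block_valid() predicate in place of pfn_valid():

```c
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

/* Power-of-two version, like the kernel macro. */
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

/* Invented predicate: pretend only even-numbered pageblocks are valid. */
static int block_valid(unsigned long pfn)
{
	return (ALIGN_DOWN(pfn, PAGEBLOCK_NR_PAGES) / PAGEBLOCK_NR_PAGES) % 2 == 0;
}

int main(void)
{
	unsigned long pfn, visited = 0;

	for (pfn = 0; pfn < 4 * PAGEBLOCK_NR_PAGES; pfn++) {
		if (!block_valid(pfn)) {
			/* Skip the whole pageblock in one step, as the hunk does. */
			pfn = ALIGN_DOWN(pfn, PAGEBLOCK_NR_PAGES) + PAGEBLOCK_NR_PAGES - 1;
			continue;
		}
		visited++;	/* the real code zeroes the struct page here */
	}

	/* Two of the four blocks are valid: 2 * 512 = 1024 pfns visited. */
	printf("visited %lu pfns\n", visited);
	return 0;
}
```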
*/ -void __paginginit zero_resv_unavail(void) +void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; @@ -6404,8 +6460,11 @@ void __paginginit zero_resv_unavail(void) pgcnt = 0; for_each_resv_unavail_range(i, &start, &end) { for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; continue; + } mm_zero_struct_page(pfn_to_page(pfn)); pgcnt++; } @@ -6421,7 +6480,7 @@ void __paginginit zero_resv_unavail(void) if (pgcnt) pr_info("Reserved but unavailable: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK */ +#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6847,6 +6906,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); + zero_resv_unavail(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid, NULL, @@ -6857,7 +6917,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } - zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core, @@ -6939,9 +6998,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + struct page *page = virt_to_page(pos); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from 'pos' + * because some architectures' virt_to_page() + * work with aliases. Getting the direct map + * address ensures that we get a _writeable_ + * alias for the memset(). + */ + direct_map_addr = page_address(page); if ((unsigned int)poison <= 0xFF) - memset(pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); } if (pages && s) @@ -7033,9 +7104,9 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init free_area_init(unsigned long *zones_size) { + zero_resv_unavail(); free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); - zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) @@ -8024,3 +8095,33 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Set PG_hwpoison flag if a given page is confirmed to be a free page. This + * test is performed under the zone lock to prevent a race against page + * allocation. 
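set_hwpoison_free_buddy_page(), added below, has to locate the free buddy block that contains the poisoned page. For each order it clears the low `order` bits of the pfn to get the candidate block head, then checks whether that head really is a free buddy page of at least that order. The index arithmetic on its own, as an illustrative userspace sketch:

```c
#include <stdio.h>

#define MAX_ORDER 11

/* pfn of the order-`order` aligned block that would contain this pfn. */
static unsigned long buddy_head_pfn(unsigned long pfn, unsigned int order)
{
	return pfn - (pfn & ((1UL << order) - 1));
}

int main(void)
{
	unsigned long pfn = 0x12345;
	unsigned int order;

	/*
	 * In the kernel the loop stops at the first head that is actually a
	 * free buddy page of sufficient order; here we just print them all.
	 */
	for (order = 0; order < MAX_ORDER; order++)
		printf("order %2u: head pfn 0x%lx\n", order, buddy_head_pfn(pfn, order));
	return 0;
}
```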
+ */ +bool set_hwpoison_free_buddy_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + unsigned int order; + bool hwpoisoned = false; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && page_order(page_head) >= order) { + if (!TestSetPageHWPoison(page)) + hwpoisoned = true; + break; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return hwpoisoned; +} +#endif diff --git a/mm/page_ext.c b/mm/page_ext.c index 5295ef331165..a9826da84ccb 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; @@ -195,7 +195,7 @@ fail: #else /* CONFIG_FLAT_NODE_MEM_MAP */ -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); diff --git a/mm/page_io.c b/mm/page_io.c index b41cf9644585..aafd19ec1db4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); diff --git a/mm/percpu.c b/mm/percpu.c index 0b6480979ac7..a749d4d96e3e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -170,6 +170,14 @@ static LIST_HEAD(pcpu_map_extend_chunks); int pcpu_nr_empty_pop_pages; /* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets + * allocated/deallocated, it is allocated/deallocated in all units of a chunk + * and increments/decrements this count by 1). + */ +static unsigned long pcpu_nr_populated; + +/* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one @@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; + pcpu_nr_populated += nr; if (!for_alloc) { chunk->nr_empty_pop_pages += nr; @@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, chunk->nr_populated -= nr; chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; + pcpu_nr_populated -= nr; } /* @@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); + /* include all regions of the first chunk */ + pcpu_nr_populated += PFN_DOWN(size_sum); + pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); @@ -2746,6 +2759,22 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ /* + * pcpu_nr_pages - calculate total number of populated backing pages + * + * This reflects the number of pages populated to back chunks. 
Metadata is + * excluded in the number exposed in meminfo as the number of backing pages + * scales with the number of cpus and can quickly outweigh the memory used for + * metadata. It also keeps this calculation nice and simple. + * + * RETURNS: + * Total number of populated backing pages in use by the allocator. + */ +unsigned long pcpu_nr_pages(void) +{ + return pcpu_nr_populated * pcpu_nr_units; +} + +/* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. diff --git a/mm/readahead.c b/mm/readahead.c index e273f0de3376..a59ea70527b9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -19,6 +19,7 @@ #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> +#include <linux/blk-cgroup.h> #include "internal.h" @@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; + unsigned long add_pages; pgoff_t prev_offset; /* @@ -474,10 +476,17 @@ readit: * Will this read hit the readahead marker made by itself? * If so, trigger the readahead marker hit now, and merge * the resulted next readahead window into the current one. + * Take care of maximum IO pages as above. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max_pages); - ra->size += ra->async_size; + add_pages = get_next_ra_size(ra, max_pages); + if (ra->size + add_pages <= max_pages) { + ra->async_size = add_pages; + ra->size += add_pages; + } else { + ra->size = max_pages; + ra->async_size = max_pages >> 1; + } } return ra_submit(ra, mapping, filp); @@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + if (blk_cgroup_congested()) + return; + /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { force_page_cache_readahead(mapping, filp, offset, req_size); @@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping, if (inode_read_congested(mapping->host)) return; + if (blk_cgroup_congested()) + return; + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); } diff --git a/mm/rmap.c b/mm/rmap.c index 6db729dc4c50..eb477809a5c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -64,6 +64,7 @@ #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> +#include <linux/userfaultfd_k.h> #include <asm/tlbflush.h> @@ -1481,11 +1482,16 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pvmw.pte, pteval); } - } else if (pte_unused(pteval)) { + } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. + * A future reference will then fault in a new zero + * page. When userfaultfd is active, we must not drop + * this page though, as its main user (postcopy + * migration) will not expect userfaults on already + * copied pages. 
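The readahead.c hunk above stops the marker-hit merge from growing the window past the per-file maximum: the next chunk is appended only if it still fits, otherwise the window is clamped to max_pages with half of it marked async. The clamping logic in isolation; next_ra_size() here is a crude doubling stand-in for the kernel's get_next_ra_size():

```c
#include <stdio.h>

struct ra_state {
	unsigned long size;
	unsigned long async_size;
};

/* Crude stand-in for get_next_ra_size(): double, but never exceed max. */
static unsigned long next_ra_size(const struct ra_state *ra, unsigned long max)
{
	unsigned long next = ra->size * 2;

	return next < max ? next : max;
}

static void merge_async_window(struct ra_state *ra, unsigned long max_pages)
{
	unsigned long add_pages = next_ra_size(ra, max_pages);

	if (ra->size + add_pages <= max_pages) {
		ra->async_size = add_pages;
		ra->size += add_pages;
	} else {
		/* Would overshoot: pin the window at the maximum instead. */
		ra->size = max_pages;
		ra->async_size = max_pages >> 1;
	}
}

int main(void)
{
	struct ra_state ra = { .size = 32, .async_size = 32 };

	merge_async_window(&ra, 128);	/* 32 + 64 fits: size 96, async 64 */
	printf("size=%lu async=%lu\n", ra.size, ra.async_size);
	merge_async_window(&ra, 128);	/* 96 + 128 would not fit: clamp to 128/64 */
	printf("size=%lu async=%lu\n", ra.size, ra.async_size);
	return 0;
}
```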
*/ dec_mm_counter(mm, mm_counter(page)); /* We have to invalidate as we cleared the pte */ diff --git a/mm/shmem.c b/mm/shmem.c index 2cab84403055..0376c124b043 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,6 +29,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> +#include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/swap.h> @@ -123,7 +124,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, int *fault_type); + struct vm_fault *vmf, vm_fault_t *fault_type); int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) @@ -1239,8 +1240,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, - false); + error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, + &memcg, false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1420,7 +1421,7 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - memset(vma, 0, sizeof(*vma)); + vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); @@ -1619,7 +1620,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type) + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1712,7 +1714,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1818,7 +1820,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); if (error) goto unacct; @@ -2187,7 +2189,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = get_seconds(); + inode->i_generation = prandom_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); @@ -2291,7 +2293,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, __SetPageSwapBacked(page); __SetPageUptodate(page); - ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; @@ -3896,18 +3898,11 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static const struct dentry_operations 
anon_ops = { - .d_dname = simple_dname -}; - static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { - struct file *res; struct inode *inode; - struct path path; - struct super_block *sb; - struct qstr this; + struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); @@ -3918,41 +3913,21 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - res = ERR_PTR(-ENOMEM); - this.name = name; - this.len = strlen(name); - this.hash = 0; /* will go */ - sb = mnt->mnt_sb; - path.mnt = mntget(mnt); - path.dentry = d_alloc_pseudo(sb, &this); - if (!path.dentry) - goto put_memory; - d_set_d_op(path.dentry, &anon_ops); - - res = ERR_PTR(-ENOSPC); - inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags); - if (!inode) - goto put_memory; - + inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, + flags); + if (unlikely(!inode)) { + shmem_unacct_size(flags, size); + return ERR_PTR(-ENOSPC); + } inode->i_flags |= i_flags; - d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (!IS_ERR(res)) + res = alloc_file_pseudo(inode, mnt, name, O_RDWR, + &shmem_file_operations); if (IS_ERR(res)) - goto put_path; - - res = alloc_file(&path, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); - if (IS_ERR(res)) - goto put_path; - - return res; - -put_memory: - shmem_unacct_size(flags, size); -put_path: - path_put(&path); + iput(inode); return res; } diff --git a/mm/slab.h b/mm/slab.h index 68bdf498da3b..58c6c1c2a78e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* List of all root caches. */ extern struct list_head slab_root_caches; @@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); -#else /* CONFIG_MEMCG && !CONFIG_SLOB */ +#else /* CONFIG_MEMCG_KMEM */ /* If !memcg, all caches are root. 
*/ #define slab_root_caches slab_caches @@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 890b1f04a03a..fea3376f9816 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM LIST_HEAD(slab_root_caches); @@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s) static inline void memcg_unlink_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ /* * Figure out what the alignment of the objects will be given a set of @@ -567,10 +567,14 @@ static int shutdown_cache(struct kmem_cache *s) list_del(&s->list); if (s->flags & SLAB_TYPESAFE_BY_RCU) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_unlink(s); +#endif list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { #ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_unlink(s); sysfs_slab_release(s); #else slab_kmem_cache_release(s); @@ -580,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. @@ -857,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s) static inline void flush_memcg_workqueue(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { diff --git a/mm/slub.c b/mm/slub.c index a3b8467c14af..8da34a8af53d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,7 +19,6 @@ #include <linux/slab.h> #include "slab.h" #include <linux/proc_fs.h> -#include <linux/notifier.h> #include <linux/seq_file.h> #include <linux/kasan.h> #include <linux/cpu.h> @@ -271,8 +270,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - if (object) - prefetch(freelist_dereference(s, object + s->offset)); + prefetch(object + s->offset); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) @@ -5667,7 +5665,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); out: kobject_put(&s->kobj); } @@ -5752,6 +5749,12 @@ static void sysfs_slab_remove(struct kmem_cache *s) schedule_work(&s->kobj_remove_work); } +void sysfs_slab_unlink(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_del(&s->kobj); +} + void sysfs_slab_release(struct kmem_cache *s) { if (slab_state >= FULL) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bd0276d5f66b..8301293331a2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long goal) { return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void *vmemmap_buf; -static void *vmemmap_buf_end; - void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the 
main allocator is up use that, fallback to bootmem. */ @@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) /* need to make sure size is all the same during early stage */ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) { - void *ptr; - - if (!vmemmap_buf) - return vmemmap_alloc_block(size, node); - - /* take the from buf */ - ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); - if (ptr + size > vmemmap_buf_end) - return vmemmap_alloc_block(size, node); - - vmemmap_buf = ptr + size; + void *ptr = sparse_buffer_alloc(size); + if (!ptr) + ptr = vmemmap_alloc_block(size, node); return ptr; } @@ -272,45 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, return map; } - -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - void *vmemmap_buf_start; - - size = ALIGN(size, PMD_SIZE); - vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, - PMD_SIZE, __pa(MAX_DMA_ADDRESS)); - - if (vmemmap_buf_start) { - vmemmap_buf = vmemmap_buf_start; - vmemmap_buf_end = vmemmap_buf_start + size * map_count; - } - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - struct mem_section *ms; - - if (!present_section_nr(pnum)) - continue; - - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[pnum]) - continue; - ms = __nr_to_section(pnum); - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - ms->section_mem_map = 0; - } - - if (vmemmap_buf_start) { - /* need to free left buf */ - memblock_free_early(__pa(vmemmap_buf), - vmemmap_buf_end - vmemmap_buf); - vmemmap_buf = NULL; - vmemmap_buf_end = NULL; - } -} diff --git a/mm/sparse.c b/mm/sparse.c index f13f2723950a..10b07eea9a6e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -200,6 +200,11 @@ static inline int next_present_section_nr(int section_nr) (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +static inline unsigned long first_present_section_nr(void) +{ + return next_present_section_nr(-1); +} + /* Record a memory area against a node. 
*/ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -257,19 +262,14 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } -static int __meminit sparse_init_one_section(struct mem_section *ms, +static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, unsigned long *pageblock_bitmap) { - if (!present_section(ms)) - return -EINVAL; - ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | SECTION_HAS_MEM_MAP; ms->pageblock_flags = pageblock_bitmap; - - return 1; } unsigned long usemap_size(void) @@ -370,160 +370,121 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long usemap_count, int nodeid) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static unsigned long __init section_map_size(void) { - void *usemap; - unsigned long pnum; - unsigned long **usemap_map = (unsigned long **)data; - int size = usemap_size(); - - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - size * usemap_count); - if (!usemap) { - pr_warn("%s: allocation failed\n", __func__); - return; - } + return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); +} - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[pnum] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[pnum]); - } +#else +static unsigned long __init section_map_size(void) +{ + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } -#ifndef CONFIG_SPARSEMEM_VMEMMAP struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap) { - struct page *map; - unsigned long size; + unsigned long size = section_map_size(); + struct page *map = sparse_buffer_alloc(size); + + if (map) + return map; - size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); map = memblock_virt_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - void *map; - unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - - size = PAGE_ALIGN(size); - map = memblock_virt_alloc_try_nid_raw(size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nodeid); - if (map) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[pnum] = map; - map += size; - } - return; - } +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ - /* fallback */ - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - struct mem_section *ms; +static void *sparsemap_buf __meminitdata; +static void *sparsemap_buf_end __meminitdata; - if (!present_section_nr(pnum)) - continue; - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[pnum]) - continue; - ms = __nr_to_section(pnum); - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - ms->section_mem_map = 0; - } +static void __init sparse_buffer_init(unsigned long size, int nid) +{ + WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
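The sparse_buffer_init()/sparse_buffer_alloc()/sparse_buffer_fini() helpers added in this hunk form a per-node bump allocator: one large boot-time buffer is carved up with size-aligned cuts, callers fall back to a normal allocation when it runs dry, and the unused tail is returned afterwards. A self-contained userspace imitation, with malloc/free standing in for the memblock calls:

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Align up; like the kernel's PTR_ALIGN this assumes a power-of-two size. */
#define PTR_ALIGN(p, a) ((char *)(((uintptr_t)(p) + (a) - 1) & ~((uintptr_t)(a) - 1)))

static char *buf_start, *buf, *buf_end;

static void buffer_init(size_t size)
{
	buf_start = malloc(size);	/* memblock_virt_alloc_try_nid_raw() in the kernel */
	buf = buf_start;
	buf_end = buf_start ? buf_start + size : NULL;
}

/* Size-aligned bump allocation; NULL tells the caller to fall back elsewhere. */
static void *buffer_alloc(size_t size)
{
	char *ptr = NULL;

	if (buf) {
		ptr = PTR_ALIGN(buf, size);
		if (ptr + size > buf_end)
			ptr = NULL;
		else
			buf = ptr + size;
	}
	return ptr;
}

static void buffer_fini(void)
{
	free(buf_start);	/* the kernel frees only the unused tail */
	buf_start = buf = buf_end = NULL;
}

int main(void)
{
	buffer_init(4096);
	printf("%p\n", buffer_alloc(1024));	/* carved from the buffer */
	printf("%p\n", buffer_alloc(1024));
	printf("%p\n", buffer_alloc(4096));	/* no longer fits: NULL, caller falls back */
	buffer_fini();
	return 0;
}
```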
*/ + sparsemap_buf = + memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + sparsemap_buf_end = sparsemap_buf + size; } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) +static void __init sparse_buffer_fini(void) { - struct page **map_map = (struct page **)data; - sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, - map_count, nodeid); + unsigned long size = sparsemap_buf_end - sparsemap_buf; + + if (sparsemap_buf && size > 0) + memblock_free_early(__pa(sparsemap_buf), size); + sparsemap_buf = NULL; } -#else -static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) -{ - struct page *map; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - map = sparse_mem_map_populate(pnum, nid, NULL); - if (map) - return map; +void * __meminit sparse_buffer_alloc(unsigned long size) +{ + void *ptr = NULL; - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - ms->section_mem_map = 0; - return NULL; + if (sparsemap_buf) { + ptr = PTR_ALIGN(sparsemap_buf, size); + if (ptr + size > sparsemap_buf_end) + ptr = NULL; + else + sparsemap_buf = ptr + size; + } + return ptr; } -#endif void __weak __meminit vmemmap_populate_print_last(void) { } -/** - * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap - * @map: usemap_map for pageblock flags or mmap_map for vmemmap +/* + * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) + * And number of present sections in this node is map_count. */ -static void __init alloc_usemap_and_memmap(void (*alloc_func) - (void *, unsigned long, unsigned long, - unsigned long, int), void *data) +static void __init sparse_init_nid(int nid, unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count) { - unsigned long pnum; - unsigned long map_count; - int nodeid_begin = 0; - unsigned long pnum_begin = 0; - - for_each_present_section_nr(0, pnum) { - struct mem_section *ms; + unsigned long pnum, usemap_longs, *usemap; + struct page *map; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; + usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), + usemap_size() * + map_count); + if (!usemap) { + pr_err("%s: node[%d] usemap allocation failed", __func__, nid); + goto failed; + } + sparse_buffer_init(map_count * section_map_size(), nid); + for_each_present_section_nr(pnum_begin, pnum) { + if (pnum >= pnum_end) + break; + + map = sparse_mem_map_populate(pnum, nid, NULL); + if (!map) { + pr_err("%s: node[%d] memory map backing failed. 
Some memory will not be available.", + __func__, nid); + pnum_begin = pnum; + goto failed; + } + check_usemap_section_nr(nid, usemap); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); + usemap += usemap_longs; } - map_count = 1; - for_each_present_section_nr(pnum_begin + 1, pnum) { + sparse_buffer_fini(); + return; +failed: + /* We failed to allocate, mark all the following pnums as not present */ + for_each_present_section_nr(pnum_begin, pnum) { struct mem_section *ms; - int nodeid; + if (pnum >= pnum_end) + break; ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - map_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - alloc_func(data, pnum_begin, pnum, - map_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - map_count = 1; + ms->section_mem_map = 0; } - /* ok, last chunk */ - alloc_func(data, pnum_begin, __highest_present_section_nr+1, - map_count, nodeid_begin); } /* @@ -532,72 +493,29 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) */ void __init sparse_init(void) { - unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - int size2; - struct page **map_map; -#endif - - /* see include/linux/mmzone.h 'struct mem_section' definition */ - BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + unsigned long pnum_begin = first_present_section_nr(); + int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); + unsigned long pnum_end, map_count = 1; /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ set_pageblock_order(); - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
- */ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = memblock_virt_alloc(size, 0); - if (!usemap_map) - panic("can not allocate usemap_map\n"); - alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = memblock_virt_alloc(size2, 0); - if (!map_map) - panic("can not allocate map_map\n"); - alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map); -#endif - - for_each_present_section_nr(0, pnum) { - usemap = usemap_map[pnum]; - if (!usemap) - continue; + for_each_present_section_nr(pnum_begin + 1, pnum_end) { + int nid = sparse_early_nid(__nr_to_section(pnum_end)); -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - map = map_map[pnum]; -#else - map = sparse_early_mem_map_alloc(pnum); -#endif - if (!map) + if (nid == nid_begin) { + map_count++; continue; - - sparse_init_one_section(__nr_to_section(pnum), pnum, map, - usemap); + } + /* Init node with sections in range [pnum_begin, pnum_end) */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); + nid_begin = nid; + pnum_begin = pnum_end; + map_count = 1; } - + /* cover the last node */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); vmemmap_populate_print_last(); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - memblock_free_early(__pa(map_map), size2); -#endif - memblock_free_early(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -760,6 +678,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; + ret = 0; memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); if (!memmap) return -ENOMEM; @@ -786,12 +705,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, #endif section_mark_present(ms); - - ret = sparse_init_one_section(ms, section_nr, memmap, usemap); + sparse_init_one_section(ms, section_nr, memmap, usemap); out: pgdat_resize_unlock(pgdat, &flags); - if (ret <= 0) { + if (ret < 0) { kfree(usemap); __kfree_section_memmap(memmap, altmap); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index a791411fed71..63a7b4563a57 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -38,9 +38,9 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; -DEFINE_MUTEX(swap_slots_cache_mutex); +static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ -DEFINE_MUTEX(swap_slots_cache_enable_mutex); +static DEFINE_MUTEX(swap_slots_cache_enable_mutex); static void __drain_swap_slots_cache(unsigned int type); static void deactivate_swap_slots_cache(void); @@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, - cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, + cache->slots, 1); return cache->nr; } @@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, true, &entry); + get_swap_pages(1, &entry, HPAGE_PMD_NR); goto out; } @@ -350,7 +350,7 @@ repeat: goto out; } - get_swap_pages(1, false, &entry); + get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git 
a/mm/swapfile.c b/mm/swapfile.c index 2cc2972eedaf..d954b71c4f9c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si, #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR + +#define swap_entry_size(size) (size) #else #define SWAPFILE_CLUSTER 256 + +/* + * Define swap_entry_size() as constant to let compiler to optimize + * out some code if !CONFIG_THP_SWAP + */ +#define swap_entry_size(size) 1 #endif #define LATENCY_LIMIT 256 @@ -269,7 +277,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info) static inline bool cluster_is_huge(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_HUGE; + if (IS_ENABLED(CONFIG_THP_SWAP)) + return info->flags & CLUSTER_FLAG_HUGE; + return false; } static inline void cluster_clear_huge(struct swap_cluster_info *info) @@ -296,13 +306,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +/* + * Determine the locking method in use for this device. Return + * swap_cluster_info if SSD-style cluster-based locking is in place. + */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, - unsigned long offset) + struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; + /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); + /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); @@ -863,7 +878,6 @@ no_page: return n_ret; } -#ifdef CONFIG_THP_SWAP static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; @@ -871,6 +885,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) unsigned long offset, i; unsigned char *map; + /* + * Should not even be attempting cluster allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) { + VM_WARN_ON_ONCE(1); + return 0; + } + if (cluster_list_empty(&si->free_clusters)) return 0; @@ -901,13 +924,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); } -#else -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - VM_WARN_ON_ONCE(1); - return 0; -} -#endif /* CONFIG_THP_SWAP */ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) @@ -924,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { - unsigned long nr_pages = cluster ? 
SWAPFILE_CLUSTER : 1; + unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && cluster); + WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; + avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) goto noswap; @@ -945,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); + atomic_long_sub(n_goal * size, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -972,14 +988,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) { + if (size == SWAPFILE_CLUSTER) { if (!(si->flags & SWP_FILE)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret || cluster) + if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -1005,7 +1021,7 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long)(n_goal - n_ret) * nr_pages, + atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; @@ -1107,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } -static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) +static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, + unsigned long offset, + unsigned char usage) { - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; - ci = lock_cluster_or_swap_info(p, offset); - count = p->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; @@ -1144,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, usage = count | has_cache; p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; + return usage; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); return usage; @@ -1184,19 +1208,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. 
*/ -static void swapcache_free(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); - } -} - -#ifdef CONFIG_THP_SWAP -static void swapcache_free_cluster(swp_entry_t entry) +void put_swap_page(struct page *page, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1205,42 +1217,48 @@ static void swapcache_free_cluster(swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; + int size = swap_entry_size(hpage_nr_pages(page)); si = _swap_info_get(entry); if (!si) return; - ci = lock_cluster(si, offset); - VM_BUG_ON(!cluster_is_huge(ci)); - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (!free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] &= ~SWAP_HAS_CACHE; + ci = lock_cluster_or_swap_info(si, offset); + if (size == SWAPFILE_CLUSTER) { + VM_BUG_ON(!cluster_is_huge(ci)); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + cluster_clear_huge(ci); + if (free_entries == SWAPFILE_CLUSTER) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + return; + } } - cluster_clear_huge(ci); - unlock_cluster(ci); - if (free_entries == SWAPFILE_CLUSTER) { - spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - } else if (free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { - if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); + for (i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); + if (i == size - 1) + return; + lock_cluster_or_swap_info(si, offset); } } + unlock_cluster_or_swap_info(si, ci); } +#ifdef CONFIG_THP_SWAP int split_swap_cluster(swp_entry_t entry) { struct swap_info_struct *si; @@ -1255,19 +1273,7 @@ int split_swap_cluster(swp_entry_t entry) unlock_cluster(ci); return 0; } -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif /* CONFIG_THP_SWAP */ - -void put_swap_page(struct page *page, swp_entry_t entry) -{ - if (!PageTransHuge(page)) - swapcache_free(entry); - else - swapcache_free_cluster(entry); -} +#endif static int swp_entry_cmp(const void *ent1, const void *ent2) { @@ -1409,7 +1415,6 @@ out: return count; } -#ifdef CONFIG_THP_SWAP static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry) { @@ -1422,12 +1427,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { - if (map[roffset] != SWAP_HAS_CACHE) + if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < SWAPFILE_CLUSTER; i++) { - if (map[offset + i] != SWAP_HAS_CACHE) { + if (swap_count(map[offset + i])) { ret = true; break; } @@ -1442,7 
+1447,7 @@ static bool page_swapped(struct page *page) swp_entry_t entry; struct swap_info_struct *si; - if (likely(!PageTransCompound(page))) + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) return page_swapcount(page) != 0; page = compound_head(page); @@ -1466,10 +1471,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { + mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) @@ -1516,26 +1519,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, return map_swapcount; } -#else -#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) -#define page_swapped(page) (page_swapcount(page) != 0) - -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, - int *total_swapcount) -{ - int mapcount, swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - mapcount = page_trans_huge_mapcount(page, total_mapcount); - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return mapcount + swapcount; -} -#endif /* * We can write to an anon page without COW if there are no other references @@ -2909,6 +2892,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) return 0; } + +/* + * Find out how many pages are allowed for a single swap device. There + * are two limiting factors: + * 1) the number of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte, as defined by the different + * architectures. + * + * In order to find the largest possible bit mask, a swap entry with + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again, and finally the swap offset is + * extracted. + * + * This will mask all the bits from the initial ~0UL mask that can't + * be encoded in either the swp_entry_t or the architecture definition + * of a swap pte. + */ +unsigned long generic_max_swapfile_size(void) +{ + return swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; +} + +/* Can be overridden by an architecture for additional checks. */ +__weak unsigned long max_swapfile_size(void) +{ + return generic_max_swapfile_size(); +} + static unsigned long read_swap_header(struct swap_info_struct *p, union swap_header *swap_header, struct inode *inode) @@ -2944,22 +2956,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->cluster_next = 1; p->cluster_nr = 0; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number - * of bits for the swap offset in the swp_entry_t type, and - * 2) the number of bits in the swap pte as defined by the - * different architectures. In order to find the - * largest possible bit mask, a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again, and finally the swap - * offset is extracted. 
This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + maxpages = max_swapfile_size(); last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); @@ -3731,6 +3728,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask) +{ + struct swap_info_struct *si, *next; + if (!(gfp_mask & __GFP_IO) || !memcg) + return; + + if (!blk_cgroup_congested()) + return; + + /* + * We've already scheduled a throttle, avoid taking the global swap + * lock. + */ + if (current->throttle_queue) + return; + + spin_lock(&swap_avail_lock); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], + avail_lists[node]) { + if (si->bdev) { + blkcg_schedule_throttle(bdev_get_queue(si->bdev), + true); + break; + } + } + spin_unlock(&swap_avail_lock); +} +#endif + static int __init swapfile_init(void) { int nid; diff --git a/mm/usercopy.c b/mm/usercopy.c index e9e9325f7638..852eb4e53f06 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -20,6 +20,8 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <linux/atomic.h> +#include <linux/jump_label.h> #include <asm/sections.h> /* @@ -240,6 +242,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } +static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); + /* * Validates that the given object is: * - not bogus address @@ -248,6 +252,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { + if (static_branch_unlikely(&bypass_usercopy_checks)) + return; + /* Skip all tests if size is zero. */ if (!n) return; @@ -279,3 +286,21 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); + +static bool enable_checks __initdata = true; + +static int __init parse_hardened_usercopy(char *str) +{ + return strtobool(str, &enable_checks); +} + +__setup("hardened_usercopy=", parse_hardened_usercopy); + +static int __init set_hardened_usercopy(void) +{ + if (enable_checks == false) + static_branch_enable(&bypass_usercopy_checks); + return 1; +} + +late_initcall(set_hardened_usercopy); diff --git a/mm/util.c b/mm/util.c index 3351659200e6..d2890a407332 100644 --- a/mm/util.c +++ b/mm/util.c @@ -196,7 +196,7 @@ void *vmemdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(vmemdup_user); -/* +/** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. @@ -434,6 +434,13 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) } EXPORT_SYMBOL(kvmalloc_node); +/** + * kvfree - free memory allocated with kvmalloc + * @addr: pointer returned by kvmalloc + * + * If the memory is allocated from vmalloc area it is freed with vfree(). + * Otherwise kfree() is used. 
+ */ void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) diff --git a/mm/vmacache.c b/mm/vmacache.c index db7596eb6132..ea517bef7dc5 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -6,6 +6,18 @@ #include <linux/sched/task.h> #include <linux/mm.h> #include <linux/vmacache.h> +#include <asm/pgtable.h> + +/* + * Hash based on the pmd of addr if configured with MMU, which provides a good + * hit rate for workloads with spatial locality. Otherwise, use pages. + */ +#ifdef CONFIG_MMU +#define VMACACHE_SHIFT PMD_SHIFT +#else +#define VMACACHE_SHIFT PAGE_SHIFT +#endif +#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) /* * Flush vma caches for threads that share a given mm. @@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm) struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { + int idx = VMACACHE_HASH(addr); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; - if (!vma) - continue; - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; + if (vma) { +#ifdef CONFIG_DEBUG_VM_VMACACHE + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; +#endif + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; @@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, unsigned long start, unsigned long end) { + int idx = VMACACHE_HASH(start); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; if (vma && vma->vm_start == start && vma->vm_end == end) { count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cfea25be7754..a728fc492557 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1907,10 +1907,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size diff --git a/mm/vmscan.c b/mm/vmscan.c index 03822f86f288..7e7d25504651 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -65,12 +65,6 @@ struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* Allocation order */ - int order; - /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. 
@@ -83,12 +77,6 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* Scan (total_size >> priority) pages at once */ - int priority; - - /* The highest zone to isolate pages for reclaim from */ - enum zone_type reclaim_idx; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; @@ -111,6 +99,18 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate pages for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -169,6 +169,70 @@ unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); +#ifdef CONFIG_MEMCG_KMEM + +/* + * We allow subsystems to populate their shrinker-related + * LRU lists before register_shrinker_prepared() is called + * for the shrinker, since we don't want to impose + * restrictions on their internal registration order. + * In this case shrink_slab_memcg() may find corresponding + * bit is set in the shrinkers map. + * + * This value is used by the function to detect registering + * shrinkers and to skip do_shrink_slab() calls for them. + */ +#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) + +static DEFINE_IDR(shrinker_idr); +static int shrinker_nr_max; + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) { + if (memcg_expand_shrinker_maps(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + + shrinker_nr_max = id + 1; + } + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + down_write(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); + up_write(&shrinker_rwsem); +} +#else /* CONFIG_MEMCG_KMEM */ +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { @@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + if (prealloc_memcg_shrinker(shrinker)) + goto free_deferred; + } + return 0; + +free_deferred: + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { + if (!shrinker->nr_deferred) + return; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -326,6 +407,10 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); +#ifdef CONFIG_MEMCG_KMEM + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + idr_replace(&shrinker_idr, shrinker, shrinker->id); +#endif up_write(&shrinker_rwsem); } @@ -347,6 +432,8 @@ void 
unregister_shrinker(struct shrinker *shrinker) { if (!shrinker->nr_deferred) return; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); @@ -371,9 +458,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, : SHRINK_BATCH; long scanned = 0, next_deferred; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0) - return 0; + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; /* * copy the current shrinker scan count into a local variable @@ -474,6 +564,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } +#ifdef CONFIG_MEMCG_KMEM +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct memcg_shrinker_map *map; + unsigned long freed = 0; + int ret, i; + + if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, + true); + if (unlikely(!map)) + goto unlock; + + for_each_set_bit(i, map->map, shrinker_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (!shrinker) + clear_bit(i, map->map); + continue; + } + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(i, map->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * memcg_set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * <MB> <MB> + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + memcg_set_shrinker_bit(memcg, nid, i); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* CONFIG_MEMCG_KMEM */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG_KMEM */ + /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context @@ -486,10 +654,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * - * @memcg specifies the memory cgroup to target. If it is not NULL, - * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise, only unaware - * shrinkers are called. + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. * * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. 
@@ -502,9 +668,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { struct shrinker *shrinker; unsigned long freed = 0; + int ret; - if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) - return 0; + if (!mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); if (!down_read_trylock(&shrinker_rwsem)) goto out; @@ -516,19 +683,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - /* - * If kernel memory accounting is disabled, we ignore - * SHRINKER_MEMCG_AWARE flag and call all shrinkers - * passing NULL for memcg. - */ - if (memcg_kmem_enabled() && - !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) - continue; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - sc.nid = 0; - - freed += do_shrink_slab(&sc, shrinker, priority); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; /* * Bail out if someone want to register a new shrinker to * prevent the regsitration from being stalled for long periods @@ -554,6 +712,7 @@ void drop_slab_node(int nid) struct mem_cgroup *memcg = NULL; freed = 0; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); @@ -744,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, refcount = 2; if (!page_ref_freeze(page, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ if (unlikely(PageDirty(page))) { page_ref_unfreeze(page, refcount); goto cannot_free; @@ -2573,9 +2732,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; - if (memcg) - shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->priority); + shrink_slab(sc->gfp_mask, pgdat->node_id, + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2599,10 +2757,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - if (global_reclaim(sc)) - shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->priority); - if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -3064,6 +3218,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, }; /* + * scan_control uses s8 fields for order, priority, and reclaim_idx. + * Confirm they are large enough for max values. + */ + BUILD_BUG_ON(MAX_ORDER > S8_MAX); + BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); + BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); + + /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. diff --git a/mm/vmstat.c b/mm/vmstat.c index 75eda9c2b260..8ba0870ecddd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1796,11 +1796,9 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. 
*/ - preempt_disable(); queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); - preempt_enable(); } } diff --git a/mm/workingset.c b/mm/workingset.c index 40ee02c83978..4516dd790129 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -366,10 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside the IRQ-safe i_pages lock */ - local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); - local_irq_enable(); /* * Approximate a reasonable limit for the radix tree nodes @@ -402,6 +399,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + if (!nodes) + return SHRINK_EMPTY; + if (nodes <= max_nodes) return 0; return nodes - max_nodes; @@ -434,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { - spin_unlock(lru_lock); + spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } @@ -472,26 +472,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, workingset_lookup_update(mapping)); out_invalid: - xa_unlock(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: - local_irq_enable(); cond_resched(); - local_irq_disable(); - spin_lock(lru_lock); + spin_lock_irq(lru_lock); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long ret; - /* list_lru lock nests inside the IRQ-safe i_pages lock */ - local_irq_disable(); - ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); - local_irq_enable(); - return ret; + return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, + NULL); } static struct shrinker workingset_shadow_shrinker = { @@ -528,15 +522,17 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); + ret = prealloc_shrinker(&workingset_shadow_shrinker); if (ret) goto err; - ret = register_shrinker(&workingset_shadow_shrinker); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, + &workingset_shadow_shrinker); if (ret) goto err_list_lru; + register_shrinker_prepared(&workingset_shadow_shrinker); return 0; err_list_lru: - list_lru_destroy(&shadow_nodes); + free_prealloced_shrinker(&workingset_shadow_shrinker); err: return ret; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8d87e973a4f5..9da65552e7ca 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -924,20 +924,7 @@ static void reset_page(struct page *page) page->freelist = NULL; } -/* - * To prevent zspage destroy during migration, zspage freeing should - * hold locks of all pages in the zspage. - */ -void lock_zspage(struct zspage *zspage) -{ - struct page *page = get_first_page(zspage); - - do { - lock_page(page); - } while ((page = get_next_page(page)) != NULL); -} - -int trylock_zspage(struct zspage *zspage) +static int trylock_zspage(struct zspage *zspage) { struct page *cursor, *fail; @@ -1814,6 +1801,19 @@ static enum fullness_group putback_zspage(struct size_class *class, } #ifdef CONFIG_COMPACTION +/* + * To prevent zspage destroy during migration, zspage freeing should + * hold locks of all pages in the zspage. 
+ */ +static void lock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + lock_page(page); + } while ((page = get_next_page(page)) != NULL); +} + static struct dentry *zs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -1905,7 +1905,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, __SetPageMovable(newpage, page_mapping(oldpage)); } -bool zs_page_isolate(struct page *page, isolate_mode_t mode) +static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; struct size_class *class; @@ -1960,7 +1960,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -int zs_page_migrate(struct address_space *mapping, struct page *newpage, +static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { struct zs_pool *pool; @@ -2076,7 +2076,7 @@ unpin_objects: return ret; } -void zs_page_putback(struct page *page) +static void zs_page_putback(struct page *page) { struct zs_pool *pool; struct size_class *class; @@ -2108,7 +2108,7 @@ void zs_page_putback(struct page *page) spin_unlock(&class->lock); } -const struct address_space_operations zsmalloc_aops = { +static const struct address_space_operations zsmalloc_aops = { .isolate_page = zs_page_isolate, .migratepage = zs_page_migrate, .putback_page = zs_page_putback, diff --git a/mm/zswap.c b/mm/zswap.c index 7d34e69507e3..cd91fd9d96b8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = -ENOMEM; goto reject; } + + /* A second zswap_is_full() check after + * zswap_shrink() to make sure it's now + * under the max_pool_percent + */ + if (zswap_is_full()) { + ret = -ENOMEM; + goto reject; + } } /* allocate entry */ |